栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python爬虫

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python爬虫

爬取网页
    req = urllib.request.Request(url)
    req.add_header('user-agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
res = requests.get(img_url, headers = headers)
resp.text          # 返回的是一个经过解码后的字符串,是unicode类型
resp.content    # 返回的是一个原生字符串,是bytes类型
解析网页
soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.select('.slist')[0]
    imgs1 = imgs.find_all('img')
    imgs2 = imgs.find_all('a')
    imag_urls = []
    imag_titles = []
    for img in imgs1:
        imag_titles.append(img.get("alt"))
    for img in imgs2:
        imag_urls.append( s_url + img.get('href'))
    return  imag_titles, imag_urls

打印一下 soup 对象的内容,格式化输出

print(soup.prettify())
 imgs = soup.select('.slist')[0]#获取class slist
 imgs1 = imgs.find_all('img')#获取标签 img
 imag_titles.append(img.get("alt"))#获取属性alt
下载文件
 path = './out/qc_picture/'
    if not os.path.exists(path):
        os.makedirs(path)
    res = requests.get(img_url, headers = headers)
    if res.status_code == 200:
        with open(path + name + str(page) + "页.jpg", "wb") as f:
            f.write(res.content)
    print(name + "-----下载完成")
注意事项
socket.setdefaulttimeout(1) #下载超过1s就跳过
res.close() #关闭请求
time.sleep(0.1)# 等待0.1s
实例
import urllib.request
import os
import requests
import time
import socket
from bs4 import BeautifulSoup
# Abort any socket operation (connect/read) that stalls for more than 1 second.
socket.setdefaulttimeout(1)
# Request headers for the requests-based image downloads; the Referer header
# defeats the site's hotlink (anti-leech) protection.
headers = {'Referer':'https://www.mzitu.com','User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3679.0 Safari/537.36'}
url1 = "https://www.mzitu.com/xinggan"  # alternate listing URL (not referenced by the code visible here)
url = "https://www.mzitu.com/mm"  # listing base URL actually scraped by download()
def get_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header so the site
    does not reject it as an obvious bot.
    """
    browser_ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    request = urllib.request.Request(url)
    request.add_header('user-agent', browser_ua)
    response = urllib.request.urlopen(request)
    body = response.read()
    # The site serves UTF-8 pages; decode before handing the HTML on.
    return body.decode("utf-8")
def get_images(html):
    """Parse a listing page and return ``(titles, detail_page_urls)``.

    Titles are the ``alt`` attribute of every ``<img>`` inside the first
    ``.postlist`` container.  Detail URLs are the ``href`` of every
    second ``<a>`` (odd indices only) — presumably each gallery entry
    renders two links to the same page, so half are skipped; verify
    against the live markup if the site layout changes.
    """
    listing = BeautifulSoup(html, 'html.parser').select('.postlist')[0]
    imag_titles = [tag.get("alt") for tag in listing.find_all('img')]
    imag_urls = [
        tag.get("href")
        for index, tag in enumerate(listing.find_all('a'))
        if index % 2  # keep odd indices, matching the original filter
    ]
    return imag_titles, imag_urls

def get_img(url):
    """Return the ``src`` of the first image in the ``.main-image`` block of *url*."""
    page = BeautifulSoup(get_url(url), "html.parser")
    main_block = page.select('.main-image')[0]
    first_image = main_block.find_all("img")[0]
    return first_image.get("src")

def save_imgs(img_url, page, name):
    """Download one image and save it under ``./out/qc_picture/``.

    Parameters
    ----------
    img_url : str
        Direct URL of the image file.
    page : int
        Listing-page number, embedded in the output file name.
    name : str
        Image title, used as the file-name prefix.

    Fixes over the original: the success message is printed only when
    the HTTP status is actually 200 (it used to print unconditionally),
    and the response is closed in ``finally`` so it is released even if
    writing the file raises.
    """
    path = './out/qc_picture/'
    if not os.path.exists(path):
        os.makedirs(path)
    res = requests.get(img_url, headers=headers)
    try:
        if res.status_code == 200:
            with open(path + name + str(page) + "页.jpg", "wb") as f:
                f.write(res.content)
            print(name + "-----下载完成")
        else:
            # Report the failure instead of falsely claiming success.
            print(name + "-----下载失败 HTTP " + str(res.status_code))
    finally:
        res.close()  # always release the connection
    time.sleep(0.1)  # brief pause between downloads to avoid hammering the server
def download(l, r):
    """Download every gallery found on listing pages *l*..*r* (1-based, inclusive).

    For each listing page: fetch it, extract gallery titles and
    detail-page URLs, then save the cover image ("/1") of each gallery.
    Prints the total number of galleries processed at the end.

    Fixes over the original: the accumulator no longer shadows the
    ``sum`` builtin, and the parallel title/URL lists are walked with
    ``zip`` instead of manual indexing.
    """
    total = 0
    for page_no in range(l, r + 1):
        listing_url = url + "/page/" + str(page_no) + "/"
        titles, detail_urls = get_images(get_url(listing_url))
        total += len(titles)
        for title, detail_url in zip(titles, detail_urls):
            # "/1" selects the first picture of the gallery.
            cover_src = get_img(detail_url + "/1")
            save_imgs(cover_src, page_no, title)
    print("一共下载" + str(total) + "张图片")

download(5, 5)
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/580775.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号