栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python爬虫

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python爬虫

爬取网页
    req = urllib.request.Request(url)
    req.add_header('user-agent',
                   'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    response = urllib.request.urlopen(req)
    html = response.read()
res = requests.get(img_url, headers = headers)
resp.text          # 返回的是一个经过解码后的字符串,是unicode类型
resp.content    # 返回的是一个原生字符串,是bytes类型
解析网页
soup = BeautifulSoup(html, 'html.parser')
    imgs = soup.select('.slist')[0]
    imgs1 = imgs.find_all('img')
    imgs2 = imgs.find_all('a')
    imag_urls = []
    imag_titles = []
    for img in imgs1:
        imag_titles.append(img.get("alt"))
    for img in imgs2:
        imag_urls.append( s_url + img.get('href'))
    return  imag_titles, imag_urls

打印一下 soup 对象的内容,格式化输出

print(soup.prettify())
 imgs = soup.select('.slist')[0]#获取class slist
 imgs1 = imgs.find_all('img')#获取标签 img
 imag_titles.append(img.get("alt"))#获取属性alt
下载文件
 path = './out/qc_picture/'
    if not os.path.exists(path):
        os.makedirs(path)
    res = requests.get(img_url, headers = headers)
    if res.status_code == 200:
        with open(path + name + str(page) + "页.jpg", "wb") as f:
            f.write(res.content)
    print(name + "-----下载完成")
注意事项
socket.setdefaulttimeout(1) #下载超过1s就跳过
res.close() #关闭请求
time.sleep(0.1)# 等待0.1s
实例
import urllib.request
import os
import requests
import time
import socket
from bs4 import BeautifulSoup
# Abort any socket operation (connect/read) that stalls for more than 1 second.
socket.setdefaulttimeout(1)
# Request headers for the requests-based image downloads; the Referer header
# defeats the site's hotlink (anti-leech) protection.
headers = {'Referer':'https://www.mzitu.com','User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3679.0 Safari/537.36'}
url1 = "https://www.mzitu.com/xinggan"  # alternate listing URL (not referenced by the code visible here)
url = "https://www.mzitu.com/mm"  # listing base URL actually scraped by download()
def get_url(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    The request carries a desktop-browser User-Agent header so the site
    does not reject it as an obvious bot.
    """
    browser_ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36')
    request = urllib.request.Request(url)
    request.add_header('user-agent', browser_ua)
    response = urllib.request.urlopen(request)
    body = response.read()
    # The site serves UTF-8 pages; decode before handing the HTML on.
    return body.decode("utf-8")
def get_images(html):
    """Parse a listing page and return ``(titles, detail_page_urls)``.

    Titles are the ``alt`` attribute of every ``<img>`` inside the first
    ``.postlist`` container.  Detail URLs are the ``href`` of every
    second ``<a>`` (odd indices only) — presumably each gallery entry
    renders two links to the same page, so half are skipped; verify
    against the live markup if the site layout changes.
    """
    listing = BeautifulSoup(html, 'html.parser').select('.postlist')[0]
    imag_titles = [tag.get("alt") for tag in listing.find_all('img')]
    imag_urls = [
        tag.get("href")
        for index, tag in enumerate(listing.find_all('a'))
        if index % 2  # keep odd indices, matching the original filter
    ]
    return imag_titles, imag_urls

def get_img(url):
    """Return the ``src`` of the first image in the ``.main-image`` block of *url*."""
    page = BeautifulSoup(get_url(url), "html.parser")
    main_block = page.select('.main-image')[0]
    first_image = main_block.find_all("img")[0]
    return first_image.get("src")

def save_imgs(img_url, page, name):
    """Download one image and save it under ``./out/qc_picture/``.

    Parameters
    ----------
    img_url : str
        Direct URL of the image file.
    page : int
        Listing-page number, embedded in the output file name.
    name : str
        Image title, used as the file-name prefix.

    Fixes over the original: the success message is printed only when
    the HTTP status is actually 200 (it used to print unconditionally),
    and the response is closed in ``finally`` so it is released even if
    writing the file raises.
    """
    path = './out/qc_picture/'
    if not os.path.exists(path):
        os.makedirs(path)
    res = requests.get(img_url, headers=headers)
    try:
        if res.status_code == 200:
            with open(path + name + str(page) + "页.jpg", "wb") as f:
                f.write(res.content)
            print(name + "-----下载完成")
        else:
            # Report the failure instead of falsely claiming success.
            print(name + "-----下载失败 HTTP " + str(res.status_code))
    finally:
        res.close()  # always release the connection
    time.sleep(0.1)  # brief pause between downloads to avoid hammering the server
def download(l, r):
    """Download every gallery found on listing pages *l*..*r* (1-based, inclusive).

    For each listing page: fetch it, extract gallery titles and
    detail-page URLs, then save the cover image ("/1") of each gallery.
    Prints the total number of galleries processed at the end.

    Fixes over the original: the accumulator no longer shadows the
    ``sum`` builtin, and the parallel title/URL lists are walked with
    ``zip`` instead of manual indexing.
    """
    total = 0
    for page_no in range(l, r + 1):
        listing_url = url + "/page/" + str(page_no) + "/"
        titles, detail_urls = get_images(get_url(listing_url))
        total += len(titles)
        for title, detail_url in zip(titles, detail_urls):
            # "/1" selects the first picture of the gallery.
            cover_src = get_img(detail_url + "/1")
            save_imgs(cover_src, page_no, title)
    print("一共下载" + str(total) + "张图片")

download(5, 5)
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/580775.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号