介绍两种方式,一种是比较简单的方式,爬取的网站没有反爬措施,直接使用urllib和lxml库即可。
另一种例如新浪微博,想要获取某个博主完整的图片需要对页面进行下拉,则使用selenium,用自动化点击的方式来获取图片。
简单版:
import os
import re
import urllib.error
import urllib.parse
import urllib.request

from lxml import etree
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
def checkfile(filepath):
    """Ensure *filepath* exists as a directory and return it.

    Uses os.makedirs(exist_ok=True) so intermediate directories are created
    as well, and a concurrent creation does not raise.  The original
    os.mkdir failed with FileNotFoundError when parent directories were
    missing; note makedirs raises FileExistsError if *filepath* exists as a
    regular file, making that misuse explicit instead of silent.
    """
    os.makedirs(filepath, exist_ok=True)
    return filepath
def download(url, user_agent="wswp", proxy=None, num_retries=2):
    """Fetch *url* and return the raw response bytes, or None on failure.

    url: full URL including the scheme.
    user_agent: User-Agent header value sent with the request.
    proxy: optional proxy address, installed for the URL's scheme.
    num_retries: how many times to retry on transient 5xx server errors.
    """
    print("Downloading: ", url)
    headers = {'User-agent': user_agent}
    req = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        # BUG FIX: the original referenced a bare `urlparse`, which is never
        # imported -- the function lives in urllib.parse.
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read()
    except urllib.error.URLError as e:
        print("Download error", e.reason)
        html = None
        # Retry only server-side (5xx) errors; 4xx and connection failures
        # will not improve on retry.
        if num_retries > 0 and hasattr(e, "code") and 500 <= e.code < 600:
            return download(url, user_agent, proxy, num_retries - 1)
    return html
# Global running index used to name downloaded files 1.jpg, 2.jpg, ...
count = 1

def get_image(html, filepath):
    """Extract image URLs from a pic.netbian.com page and download each one.

    html: page source (bytes or str) as returned by download().
    filepath: sub-directory name under the hard-coded base directory
        (the directory must already exist -- see checkfile()).
    """
    global count
    selector = etree.HTML(html)
    # XPath copied from browser devtools with the positional indexes removed,
    # e.g. //*[@id="main"]/div[3]/ul/li[1]/a/span/img -> .../div/ul/li/a/span/img
    imgurls = selector.xpath('//*[@id="main"]/div/ul/li/a/span/img/@src')
    count = 1
    # BUG FIX: the file handle was never closed; use a context manager.
    with open("imgurl.txt", 'a+') as f:
        for src in imgurls:
            full_url = "https://pic.netbian.com" + src
            print(full_url)
            # Log the image URL.  BUG FIX: the original wrote a literal "n"
            # (the backslash of "\n" was lost in the paste).
            f.write(full_url + "\n")
            file_path = "C:/Users/78565/Desktop/py3/%s/" % filepath + "%d.jpg" % count
            if not os.path.exists(file_path):
                try:
                    urllib.request.urlretrieve(full_url, file_path)
                except OSError:
                    # Skip this image on a network/filesystem error instead of
                    # the original broken while-1 retry structure.
                    print("net error")
            count += 1
# Entry point for the simple crawler: fetch the landing page, then extract
# and download every wallpaper image it references.
url = "https://pic.netbian.com/"
filepath = "img"
k = checkfile("C:/Users/78565/Desktop/py3/" + filepath)  # ensure target folder exists
try:
    # Fetch the page HTML (None on download failure).
    html = download(url)
    if html is not None:
        # Extract the images from the page and download them.
        get_image(html, filepath)
    else:
        print("download failed, skipping image extraction")
except Exception as e:
    # BUG FIX: narrowed from a bare `except:` so Ctrl-C and SystemExit still
    # interrupt the script; the error is now shown instead of hidden.
    print("tasks end!", e)
Selenium版
首先获取cookies,需要下载chrome浏览器驱动放到chrome.exe同一目录下,并把路径加到环境变量,这样就可以不在chrome.exe目录下运行脚本
from selenium import webdriver
import time
import json
# 填写webdriver的保存目录
# Launch Chrome, let the user log in to Weibo manually, then persist the
# session cookies to cookies.txt for the scraping script to reuse.
options = webdriver.ChromeOptions()
# BUG FIX: the path lost all of its backslashes when pasted; restore them
# with a raw string so chrome.exe can actually be located.
options.binary_location = r"C:\Users\78565\AppData\Local\Google\Chrome\Application\chrome.exe"
driver = webdriver.Chrome(options=options)
# Always include the scheme (http/https) in the URL.
driver.get('https://weibo.com/')
# Log in manually within 60 seconds (the original comment said 20 seconds,
# but the code has always slept for 60).
time.sleep(60)
with open('cookies.txt', 'w') as f:
    # Persist the cookies as JSON.
    f.write(json.dumps(driver.get_cookies()))
driver.close()
打开浏览器,抓取图片
import re
import json
import selenium
from selenium import webdriver
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
import shutil
import requests
import urllib
import urllib.request
def scroll(browser):
    """Probe the page state while scrolling a Weibo feed.

    Returns True once the end-of-feed marker element is found (and clicked),
    False while more content remains to load.  Callers loop with
    `while not scroll(browser)` and keep scrolling until it returns True.

    NOTE(review): both XPath expressions below are '//div[@]', which is not a
    valid predicate -- the attribute text (e.g. @class="...") was evidently
    lost when this snippet was pasted.  Restore the original attribute values
    from the live Weibo DOM before running; as written, both lookups always
    raise and the function always returns False.
    """
    try:
        # "Network timeout, please click to retry" button -- click it if present.
        browser.find_element_by_xpath('//div[@]').click()
    except:
        pass
    try:
        # End-of-feed marker: if this element is clickable we reached the bottom.
        browser.find_element_by_xpath('//div[@]').click()
    except:
        return False
    return True
def checkfile(filepath):
    """Create *filepath* as a directory if it does not exist; return it."""
    if not os.path.exists(filepath):
        os.mkdir(filepath)
    return filepath
def get_photo(lis, filepath, base_dir="C:/Users/78565/Desktop/py3/"):
    """Download every image URL in *lis* into base_dir/filepath/.

    lis: iterable of image URLs (possibly newline-terminated strings).
    filepath: sub-directory name under *base_dir* (must already exist).
    base_dir: destination root; parameterized (with the original value as the
        default) so the path is no longer hard-coded for one machine.

    Files already present on disk are not re-downloaded.
    """
    x = 1
    for url in lis:
        # BUG FIX: the original stripped the literal character "n" (the
        # backslash of "\n" was lost in the paste), which also chewed
        # trailing 'n' characters off the URL itself.
        url = url.strip("\n")
        print(url)
        # Weibo large-image file names are the last 21 characters of the URL.
        imgname = url[-21:]
        print("开始下载")
        file_path = base_dir + "%s/" % filepath + imgname
        if not os.path.exists(file_path):
            # BUG FIX: the original `while 1` / bare `except: pass` made one
            # attempt and silently swallowed every error; retry a bounded
            # number of times and report failures.
            for attempt in range(3):
                try:
                    urllib.request.urlretrieve(url, file_path)
                    break
                except OSError:
                    print("net error")
            # Throttle only after an actual download attempt, so files that
            # already exist are skipped instantly.
            time.sleep(1)
        print("第%s张图片下载完成!" % x)
        x += 1
def get_img(browser, txt):
    """Scroll the Weibo album page to the bottom and collect image URLs.

    browser: a logged-in selenium WebDriver already on the album page.
    txt: output text file path; one large-image URL is written per line.
    Returns the list of large-image URLs found.
    """
    # Two passes: lazily loaded content sometimes appears only after the
    # first time the bottom marker is reached.
    for _ in range(2):
        # Keep dragging the scrollbar until scroll() reports the bottom.
        while not scroll(browser):
            browser.execute_script("window.scrollBy(0,3000)")
            time.sleep(1)
        time.sleep(5)
    print("***开始查找***")
    # XPath copied from browser devtools with positional indexes removed,
    # e.g. //*[@id="main"]/div[3]/ul/li[1]/... -> //*[@id="main"]/div/ul/li/...
    elems = browser.find_elements(By.XPATH, '//*[@id="app"]/div/div/div/main/div/div/div/div/div/div/div/div/div/div/div/div/div/img')
    urls = []
    # BUG FIX: the file was never closed and the original wrote a literal "n"
    # instead of a newline (mangled "\n"); an unused `str1` was also dropped.
    with open(txt, "w") as f:
        for el in elems:
            src = el.get_attribute("src")
            # Swap the thumbnail size segment for the full-resolution one.
            src = src.replace("orj360", "large")
            print(src)
            f.write(src + "\n")
            urls.append(src)
    # Report how many images were found.
    print(len(urls))
    return urls
if __name__ == '__main__':
    options = webdriver.ChromeOptions()
    # BUG FIX: every backslash after "Users" was lost in the pasted path.
    options.binary_location = r"C:\Users\78565\AppData\Local\Google\Chrome\Application\chrome.exe"
    url = "https://weibo.com/u/2616380702?tabtype=album"
    uid = url[20:30]  # the numeric user-id portion of the URL
    txt = "{}.txt".format(uid)
    # BUG FIX: the `chrome_options=` keyword is removed in Selenium 4;
    # `options=` works on both Selenium 3 and 4.
    driver = webdriver.Chrome(options=options)
    try:
        # Visit the domain once first: add_cookie only accepts cookies for
        # the currently loaded domain.
        driver.get("http://photo.weibo.com/")
        # Drop the fresh session's cookies...
        driver.delete_all_cookies()
        # ...and load the ones saved by the login script.
        with open('cookies.txt', 'r') as f:
            # Reading from a file object, so json.load (not loads).
            cookies_list = json.load(f)
        for cookie in cookies_list:
            # Selenium rejects the 'expiry' field on add_cookie; drop it.
            if 'expiry' in cookie:
                del cookie['expiry']
            driver.add_cookie(cookie)
        # Re-open the album page now that we are authenticated.
        driver.get(url)
        imglist = get_img(driver, txt)
    finally:
        # BUG FIX: the browser process was never shut down.
        driver.quit()
    checkfile("C:/Users/78565/Desktop/py3/" + uid)  # create destination folder
    get_photo(imglist, uid)
    print("Finished!")
Python的网络库有时候总是卡住,可以先保存图片地址,再用curl进行下载,Linux下使用
#include#include #include #include #include #include using namespace std; //网址文件 #define TXT "url.txt" //保存图片的文件夹 #define DIR "url" //下载文件数据接收函数 size_t dl_req_reply(void *buffer, size_t size, size_t nmemb, void *user_p) { FILE *fp = (FILE *)user_p; size_t return_size = fwrite(buffer, size, nmemb, fp); //cout << (char *)buffer << endl; return return_size; } //http POST请求文件下载 CURLcode dl_curl_post_req(const string &url, const string &postParams, string filename) { FILE *fp = fopen(filename.c_str(), "wb"); // curl初始化 CURL *curl = curl_easy_init(); // curl返回值 CURLcode res; if (curl) { // set params //设置curl的请求头 struct curl_slist* header_list = NULL; header_list = curl_slist_append(header_list, "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6788.400 QQBrowser/10.3.2843.400"); header_list = curl_slist_append(header_list, "Content-Type: application/x-www-form-urlencoded; charset=UTF-8"); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list); //不接收响应头数据0代表不接收 1代表接收 curl_easy_setopt(curl, CURLOPT_HEADER, 0); //设置请求为post请求 curl_easy_setopt(curl, CURLOPT_POST, 1); //设置请求的URL地址 curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); //设置post请求的参数 curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postParams.c_str()); //设置ssl验证 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false); //CURLOPT_VERBOSE的值为1时,会显示详细的调试信息 curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL); //设置数据接收函数和接收指针 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &dl_req_reply); curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); //设置超时时间 //curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 6); //curl_easy_setopt(curl, CURLOPT_TIMEOUT, 6); // 开启post请求 res = curl_easy_perform(curl); } //释放curl curl_easy_cleanup(curl); //释放文件资源 fclose(fp); return res; } //下载文件数据接收函数 size_t get_reply(void *buffer, size_t size, size_t nmemb, void *user_p) { FILE *fp = 
(FILE *)user_p; size_t return_size = fwrite(buffer, size, nmemb, fp); //cout << (char *)buffer << endl; return return_size; } //http get请求文件下载 CURLcode get_curl_post_req(const string &url, const string &postParams, char* filename) { //------------------------------------------------------ //检查文件是否已存在 printf("%s n",filename); struct stat st; size_t len; if (stat(filename, &st)) { len=0; } else { len= static_cast (st.st_size); } printf("len %d n",len); if(len>0) { return CURLE_SEND_ERROR; } //----------------------------------------------------- FILE *fp = fopen(filename, "w+b"); // curl初始化 CURL *curl = curl_easy_init(); // curl返回值 CURLcode res; if (curl) { // set params //设置curl的请求头 struct curl_slist* header_list = NULL; header_list = curl_slist_append(header_list, "User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko Core/1.63.6788.400 QQBrowser/10.3.2843.400"); header_list = curl_slist_append(header_list, "Content-Type: application/x-www-form-urlencoded; charset=UTF-8"); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, header_list); //不接收响应头数据0代表不接收 1代表接收 curl_easy_setopt(curl, CURLOPT_HEADER, 0); //设置请求的URL地址 curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); //设置ssl验证 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, false); curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, false); //CURLOPT_VERBOSE的值为1时,会显示详细的调试信息 curl_easy_setopt(curl, CURLOPT_VERBOSE, 1); //写入要发送的数据 curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL); //给上面的函数提供写入文件描述符 curl_easy_setopt(curl, CURLOPT_READDATA, NULL); //设置数据接收函数和接收指针 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, &get_reply); curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp); curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1); //设置超时时间 //curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 6); //curl_easy_setopt(curl, CURLOPT_TIMEOUT, 6); // 开启请求,默认为get res = curl_easy_perform(curl); } //释放curl curl_easy_cleanup(curl); //释放文件资源 fclose(fp); return res; } void get_img_url(string filename,vector &imglist) { FILE *fp = 
fopen(filename.c_str(), "rb"); char url[1024]={0}; while(fgets(url,1024,fp)) { cout< imageList; get_img_url(TXT,imageList); char buf[1024]={0}; char cmd[96]={0}; //======================================== //查看文件夹是否存在 if(popenRead("ls",buf)==0) { if(!strstr(buf,DIR)) { printf("mkdir %sn",DIR); memset(cmd,0,96); sprintf(cmd,"mkdir %s",DIR); system(cmd); } } //======================================== int count=1; for(size_t i=0; i



