想批量下载万方数据库的文献,看了一下其html源码不好玩啊.
其一篇文献的下载的链接.
下 载
onclick 事件
onclick 事件会在对象被点击时发生。
请注意, onclick 与 onmousedown 不同。单击事件是在同一元素上发生了鼠标按下事件之后又发生了鼠标放开事件时才发生的。
语法:onclick="SomeJavascriptCode"
找到upload函数。
function upload(page_cnt,id,language,source_db,title,isoa,type,resourceType){
title=window.encodeURI(window.encodeURI(title));
var type = $("#document_type").val();
if(type == "standards"){
type="standard";
}
window.open("/search/downLoad.do?page_cnt="+page_cnt+"&language="+language+"&resourceType="+type+"&source="+source_db+"&resourceId="+id+"&resourceTitle="+title+"&isoa="+isoa+"&type="+type);
}
function onlineReading(page_cnt,id,language,source_db,title,isoa,type,resourceType){
title=window.encodeURI(window.encodeURI(title));
var type = $("#document_type").val();
if(type == "standards"){
type="standard";
}
window.open("/search/onlineread.do?page_cnt="+page_cnt+"&language="+language+"&resourceType="+type+"&source="+source_db+"&resourceId="+id+"&resourceTitle="+title+"&isoa="+isoa+"&type="+type);
}
以Spectral Efficiency and Power Allocation for Mixed-ADC Massive MIMO System这篇文献为例子,其下载事件为。
下 载
点击下载,获取了一个url, 好像随机生成hash值
http://f.wanfangdata.com.cn/www/Spectral+Efficiency+and+Power+Allocation+for+Mixed-ADC+Massive+MIMO+System.ashx?type=perio&resourceId=zgtx201803009&resourceTitle=Spectral%2BEfficiency%2Band%2BPower%2BAllocation%2Bfor%2BMixed-ADC%2BMassive%2BMIMO%2BSystem&transaction=%7B%22id%22%3Anull%2C%22transferOutAccountsStatus%22%3Anull%2C%22transaction%22%3A%7B%22id%22%3A%22998101496136486912%22%2C%22status%22%3A1%2C%22createDateTime%22%3Anull%2C%22payDateTime%22%3A1526800915165%2C%22authToken%22%3A%22TGT-10848458-zHl3CXey47UjQav6HqMOisw3CZqNxO6NBjA4fvtzkCQ1tXPRcu-my.wanfangdata.com.cn%22%2C%22user%22%3A%7B%22accountType%22%3A%22Group%22%2C%22key%22%3A%22hbdesf%22%7D%2C%22transferIn%22%3A%7B%22accountType%22%3A%22Income%22%2C%22key%22%3A%22PeriodicalFulltext%22%7D%2C%22transferOut%22%3A%7B%22GTimeLimit.hbdesf%22%3A3.0%7D%2C%22turnover%22%3A3.0%2C%22productDetail%22%3A%22perio_zgtx201803009%22%2C%22productTitle%22%3Anull%2C%22userIP%22%3A%22202.110.130.244%22%2C%22organName%22%3Anull%2C%22memo%22%3Anull%2C%22webTransactionRequest%22%3Anull%2C%22signature%22%3A%22I6p3Hq9DM8nnf3U1DVVw4lZcQAF1mxcJWmNcnUpeTMY5I6jkhJtlDHrujdJa6SsKqZ26E52RnHDO%5CntPqYeEFZ6laDAwSRs0U3xwr%2FU3CS7w8zuvg8XyHEym9ufvCyJElsxwP0fSq5GMI0EaNwv45SoqQ7%5CnVI1Bhel0QUD1KVa0TFQ%3D%22%2C%22delete%22%3Afalse%7D%2C%22isCache%22%3Afalse%7D
先用浏览器访问下url,如果可以得到数据,就可以使用requests的get方法,如果不能就使用post方法
想采用requests库得到html文本.
def get_html(url):
try:
header ={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331', }
r=requests.get(url,headers = header,verify=False)
r.raise_for_status
r.encoding=r.apparent_encoding
#print(r.text)
return r
except Exception as e:
print("has error:"+str(e))
万方数据库一页显示20条结果
搜索页的url
http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord=cd&isTriggerTag= http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page=2&searchWord=cd&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page=3&searchWord=cd&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all
搜索的结果在strong标签中.采用正则表达式,可以得到搜索结果.
找到168533条结果。
def getNum(key):
head="http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
end="&isTriggerTag="
url=head+key+end
re1=r's*找到(.*?)条结果'
html=get_html(url).text
if html==None:
print("没有文献")
return ;
strnum=re.findall(re1,html)
num=int(strnum[0])
#print("找到了:",num)
return num;
根据关键字key和总结果可以构建出搜索的页面.
def search_key(key):
allurl=[]
page=0
head="http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
end="&isTriggerTag="
url=head+key+end
#print(url)
allurl.append(url)
html=get_html(url).text
if html==None:
print("text empty")
return ;
num=getNum(key)
print("找到了:",num)
if num>20:
if(num%20!=0):
page=num//20+1
else:
page=num//20
# page>1 url
head='http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page='
end='&searchWord='+key+'&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all'
for i in range(2,page+1):
url=head+str(i)+end
allurl.append(url)
l=len(allurl)
print('第',l,"页")
print(allurl[0])
print(allurl[l-1])
return allurl
这是每一页的具体url
def get_url(urls):
base='http://www.wanfangdata.com.cn//link.do'
html=get_html(urls).text
#re0=r''
re0=r']*bhref="/link.do?([^"]+)'
allUrl=re.findall(re0,html)
length=len(allUrl)
print("length=",length)
for i in range(length):
allUrl[i]=base+allUrl[i]
#print(allUrl)
return allUrl
总共的页数已经得到了,但是被js卡住了,不知道怎么生成相关的下载href。如果已经有了下载url,就可以用下面这个函数下载相关的pdf文档了。
def get_pdf(url):
text=get_html(url)
path="/home/dflx/文档/python/6.pdf"
with open(path,'wb') as f:
f.write(text.content)
print("successf")
所以现在做不下去了,于是决定先把每一篇文章的题目,页数,等重要信息爬虫下来,写入excel文件,看一看.
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 20 10:05:29 2018
@author: dflx
"""
import requests
import time
import re
import os
from bs4 import BeautifulSoup
import bs4
from urllib import parse
from multiprocessing import Pool
import xlwt
def get_html(url):
    """Fetch *url* and return the ``requests.Response``, or ``None`` on failure.

    The response encoding is reset to the sniffed ``apparent_encoding`` so
    that ``r.text`` decodes the (Chinese) pages correctly.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.

    Returns
    -------
    requests.Response or None
        ``None`` when the request fails for any reason (the error is printed).
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331', }
    try:
        # timeout added so a dead server cannot hang the crawler forever
        r = requests.get(url, headers=header, verify=False, timeout=30)
        # BUG FIX: raise_for_status is a method -- the original accessed the
        # attribute without calling it, so HTTP errors were never detected.
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except Exception as e:
        print("has error:" + str(e))
        return None
def getNum(key):
    """Return the total number of search hits for *key*, or ``None``.

    Scrapes the "找到 N 条结果" text from the Wanfang search-result page.

    Parameters
    ----------
    key : str
        Search keyword.

    Returns
    -------
    int or None
        Hit count, or ``None`` when the page could not be fetched or the
        count string was not found on it.
    """
    head = "http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
    end = "&isTriggerTag="
    url = head + key + end
    # NOTE(review): the original pattern was r's*找到(.*?)条结果'; the leading
    # 's*' looks like a paste-mangled '\s*' -- restored here.
    re1 = r'\s*找到(.*?)条结果'
    r = get_html(url)
    # BUG FIX: check the response before dereferencing .text -- the original
    # called get_html(url).text first, which raised AttributeError whenever
    # get_html returned None, making its own None-check unreachable.
    if r is None:
        print("没有文献")
        return None
    strnum = re.findall(re1, r.text)
    # BUG FIX: guard against no match instead of crashing on strnum[0]
    if not strnum:
        print("没有文献")
        return None
    return int(strnum[0])
def search_key(key):
    """Build the list of all search-result page URLs for *key*.

    The first entry is the plain search URL; subsequent entries are the
    explicit ``page=2..N`` URLs, 20 hits per page.

    Parameters
    ----------
    key : str
        Search keyword.

    Returns
    -------
    list[str] or None
        All result-page URLs, or ``None`` when the hit count could not be
        determined.
    """
    head = "http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
    end = "&isTriggerTag="
    allurl = [head + key + end]
    # BUG FIX: the original fetched this URL and dereferenced .text before
    # its None check (AttributeError on failure), and then fetched the very
    # same URL a second time inside getNum(); the redundant request and the
    # broken check are both removed.
    num = getNum(key)
    if num is None:
        print("text empty")
        return None
    print("找到了:", num)
    # Ceiling division: 20 results per page. Replaces the original
    # if/else on num % 20 with the equivalent one-liner.
    page = (num + 19) // 20
    head = 'http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page='
    end = '&searchWord=' + key + '&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all'
    for i in range(2, page + 1):
        allurl.append(head + str(i) + end)
    total = len(allurl)
    print('第', total, "页")
    print(allurl[0])
    print(allurl[total - 1])
    return allurl
def get_pdf(url, path="/home/dflx/文档/python/6.pdf"):
    """Download *url* and write the raw bytes to *path*.

    Parameters
    ----------
    url : str
        Direct download URL of the document.
    path : str, optional
        Destination file. Previously hard-coded; now a keyword parameter
        with the old value as default, so existing callers are unaffected.
    """
    r = get_html(url)
    # BUG FIX: get_html returns None on failure; the original then crashed
    # on r.content instead of reporting the problem.
    if r is None:
        print("download failed:", url)
        return
    with open(path, 'wb') as f:
        f.write(r.content)
    print("successf")
def get_information(url):
wenben=get_html(url).text
soup=BeautifulSoup(wenben,'html.parser')
title=[]
information=[]
title=soup.find_all('title')
information.append('文章的题目')
information.append(soup.title.string)
print("文章的题目:",soup.title.string)
print("--------中文摘要------------")
abstract=[]
abstract=soup.find_all('textarea')
soup_abstr=BeautifulSoup(str(abstract),'html.parser')
if len(abstract)!=0:
print(len(abstract),abstract[1].string)
if len(abstract)!=0 and abstract[1].string!=None:
information.append("中文摘要")
information.append(abstract[1].string)
print("--------中文摘要------------")
english_abstract=[]
re0=r's*(.*?)s*


