- 前言
- 原有爬取脚本
- 修正后的爬取脚本
个人需要爬取一些论文资料,所以根据CSDN上已有的代码进行修改,完成了一个可以根据dblp检索结果爬取论文的脚本。arxiv.py是爬取检索结果中下载链接给的是arxiv开源链接的论文。ICML.py是爬取检索结果中下载链接给的是PMLR链接的论文。
原有爬取脚本:最初的爬取脚本转载自博客 https://blog.csdn.net/weixin_42781851/article/details/89086048
import urllib.request
import re
import os

# Fetch the dblp search-result page for the query.
req = urllib.request.Request('https://dblp.uni-trier.de/search?q=Reinforcement%20Learning%20Decision')
response = urllib.request.urlopen(req)
the_page = response.read().decode('utf-8')

# NOTE(review): the original regex patterns were destroyed when this post was
# exported to plain text (the HTML tags inside the pattern strings were
# stripped, leaving '(.*?)' and 'view- '). The patterns below are
# reconstructed from dblp's result markup -- verify against the live page.
# Paper titles as shown in the result list.
paper_title = re.findall('<span class="title" itemprop="name">(.*?)</span>', the_page, re.S)
# Electronic-edition ("view") link of each hit.
paper_web = re.findall('view</b>.*?<a href="(.*?)"', the_page, re.S)
def get_pdf_arxiv(web_site, path):
    """Download the PDF linked from an arXiv abstract page.

    web_site -- URL of the arXiv abstract page (e.g. https://arxiv.org/abs/...).
    path     -- local file path the PDF is written to.

    A missing PDF link or an HTTP error for the PDF URL is reported and
    swallowed so one broken entry does not abort the whole crawl.
    """
    rep = urllib.request.urlopen(urllib.request.Request(web_site))
    page = rep.read().decode('utf-8')
    # NOTE(review): the original pattern was lost in the blog export
    # (it reads re.findall('', ...)). arXiv abstract pages expose the PDF
    # location in a <meta name="citation_pdf_url" content="..."> tag --
    # confirm against a live page before relying on this.
    pdf_download = re.findall('<meta name="citation_pdf_url" content="(.*?)"', page, re.S)
    # Bug fix: the original printed pdf_download[0] BEFORE checking the list
    # was non-empty, raising IndexError whenever no link was found.
    if not pdf_download:
        print(web_site, "no pdf link found")
        return
    print(pdf_download[0])
    try:
        u = urllib.request.urlopen(pdf_download[0])
    except urllib.error.HTTPError:
        print(pdf_download[0], "url file not found")
        return
    block_sz = 8192
    # Stream the PDF to disk in fixed-size chunks.
    with open(path, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if buffer:
                f.write(buffer)
            else:
                break
    print("Successful to download " + path)  # typo "Sucessful" fixed
# Walk over every search hit and fetch those hosted on arXiv.
# Bug fixes vs. the original:
#   * the Windows path literal ended in a backslash before the closing quote,
#     which is a SyntaxError -- use a raw string and os.path.join;
#   * the saved filename was title + "pdf" (missing dot) -- now ".pdf";
#   * "list" shadowed the builtin; one re.sub replaces the split/join chain;
#   * the target directory is listed once, not on every iteration.
path_dir = r"C:\Users\xxxx\Desktop\desktoppaper\RL_Decision"
dir_list = os.listdir(path_dir)
for i in range(len(paper_web)):
    if paper_web[i].find("arxiv") != -1:
        # Replace whitespace and characters Windows forbids in filenames.
        paper_title[i] = re.sub(r'[\\/:*?"<>|\s]+', '_', paper_title[i])
        print(paper_title[i])
        path = paper_title[i] + ".pdf"
        if path not in dir_list:
            get_pdf_arxiv(paper_web[i], os.path.join(path_dir, path))
非常感谢这位博主提供的爬取思路,但是在使用的过程中,出现了如下问题:
修正后的爬取脚本
1.dblp的页面有了一些变化,爬取会报错。
2.dblp采取了懒加载的策略,直接访问查询路径只能爬取最多30条数据。以下arxiv.py和上面原爬取脚本功能相似,并解决了懒加载的问题。将原有url改为dblp懒加载查询的url,并自增查询的页数b即可。可以根据查询的结果条数/30,来估计页数b的最大值。并且有些论文不是arxiv链接,则无法下载。
arxiv.py
import urllib.request
import urllib.error
import re
import os


def get_pdf_arxiv(web_site, path):
    """Download the PDF linked from the arXiv abstract page *web_site* to *path*."""
    rep = urllib.request.urlopen(urllib.request.Request(web_site))
    page = rep.read().decode('utf-8')
    # NOTE(review): the pattern was stripped by the blog export; arXiv pages
    # carry the PDF URL in a citation_pdf_url meta tag -- confirm before use.
    pdf_download = re.findall('<meta name="citation_pdf_url" content="(.*?)"', page, re.S)
    if not pdf_download:  # check BEFORE indexing (the original crashed here)
        print(web_site, "no pdf link found")
        return
    print(pdf_download[0])
    try:
        u = urllib.request.urlopen(pdf_download[0])
    except urllib.error.HTTPError:
        print(pdf_download[0], "url file not found")
        return
    block_sz = 8192
    with open(path, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if buffer:
                f.write(buffer)
            else:
                break
    print("Successful to download " + path)


# Original literal ended in a bare backslash (SyntaxError); raw string fixes it.
path_dir = r"E:\paper\offlineRL paper"
dir_list = os.listdir(path_dir)  # read once instead of on every iteration

# dblp lazy-loads 30 results per batch; parameter b selects the batch, so
# range(10) covers up to 300 hits. Only q= needs changing for other queries;
# keep everything from &s=ydvspc on.
for j in range(10):
    req = urllib.request.Request(
        'https://dblp.uni-trier.de/search//publ/inc?q=Offline%20Reinforcement%20Learning&s=ydvspc&h=30&b=' + str(j))
    response = urllib.request.urlopen(req)
    the_page = response.read().decode('utf-8')
    # NOTE(review): both patterns reconstructed after the export stripped the
    # HTML out of them -- verify against dblp's current markup.
    paper_title = re.findall('<span class="title" itemprop="name">(.*?)</span>', the_page, re.S)
    paper_web = re.findall('view</b>.*?<a href="(.*?)"', the_page, re.S)

    for i in range(len(paper_web)):
        if paper_web[i].find("arxiv") != -1:  # only arXiv-hosted entries
            # The original chained str.split("")/"_".join calls, which raise
            # ValueError (empty separator) -- the separators were eaten by the
            # export. One re.sub replaces every character Windows forbids in
            # filenames (plus whitespace) with "_".
            paper_title[i] = re.sub(r'[\\/:*?"<>|\s]+', '_', paper_title[i].replace('"', ''))
            print(paper_title[i])
            path = paper_title[i] + ".pdf"  # the original missed the dot
            if path not in dir_list:
                get_pdf_arxiv(paper_web[i], os.path.join(path_dir, path))
除此以外,我个人还想下一些ICML的论文,所以对于上面的代码进行了微调,将下载地址中需要包含arxiv改为需要包含proceedings,并且根据ICML的网页特性,找到了下载pdf的链接,完整代码在如下ICML.py中。
ICML.py
import urllib.request
import urllib.error
import re
import os


def get_pdf_arxiv(web_site, path):
    """Download the 'Download PDF' target of a PMLR proceedings page to *path*."""
    print(web_site, path)
    rep = urllib.request.urlopen(urllib.request.Request(web_site))
    page = rep.read().decode('utf-8')
    # NOTE(review): the original pattern was destroyed by the blog export
    # (only the anchor text " - Download PDF " survived). PMLR abstract pages
    # link the file with an anchor whose text is "Download PDF" -- confirm on
    # a live proceedings page.
    pdf_download = re.findall(r'href="(http[^"]*?\.pdf)"[^>]*>Download PDF', page, re.S)
    if not pdf_download:  # check BEFORE indexing (the original crashed here)
        print(web_site, "no pdf link found")
        return
    # The captured URL already ends in .pdf, so the original's
    # strip-and-reassemble via 'http://(.*?).pdf' is unnecessary.
    download_url = pdf_download[0]
    try:
        u = urllib.request.urlopen(download_url)
    except urllib.error.HTTPError:
        print(download_url, "url file not found")
        return
    block_sz = 8192
    with open(path, 'wb') as f:
        while True:
            buffer = u.read(block_sz)
            if buffer:
                f.write(buffer)
            else:
                break
    print("Successful to download " + path)


# Original literal ended in a bare backslash (SyntaxError); raw string fixes it.
path_dir = r"E:\paper\ICML paper"
dir_list = os.listdir(path_dir)  # read once instead of on every iteration

# dblp lazy-loads 30 results per batch; b selects the batch (up to 3000 hits).
for j in range(100):
    req = urllib.request.Request(
        'https://dblp.uni-trier.de/search//publ/inc?q=ICML&s=ydvspc&h=30&b=' + str(j))
    response = urllib.request.urlopen(req)
    the_page = response.read().decode('utf-8')
    # NOTE(review): both patterns reconstructed after the export stripped the
    # HTML out of them -- verify against dblp's current markup.
    paper_title = re.findall('<span class="title" itemprop="name">(.*?)</span>', the_page, re.S)
    paper_web = re.findall('view</b>.*?<a href="(.*?)"', the_page, re.S)

    for i in range(len(paper_web)):
        # PMLR links contain "proceedings" (instead of filtering for "arxiv").
        if paper_web[i].find("proceedings") != -1:
            # Single re.sub replaces the broken split("")/join chain: strips
            # whitespace and every character Windows forbids in filenames.
            paper_title[i] = re.sub(r'[\\/:*?"<>|\s]+', '_', paper_title[i].replace('"', ''))
            print(paper_title[i])
            path = paper_title[i] + ".pdf"  # the original missed the dot
            if path not in dir_list:
                get_pdf_arxiv(paper_web[i], os.path.join(path_dir, path))
# NOTE(review): this fragment is a duplicated copy of the arxiv.py download
# loop that the blog export appended after ICML.py (it begins mid-statement
# with the tail of a re.findall call). Repaired here for completeness; it
# assumes the_page, paper_title and get_pdf_arxiv are already defined above.
paper_web = re.findall('view</b>.*?<a href="(.*?)"', the_page, re.S)  # electronic-edition links
path_dir = r"E:\paper\offlineRL paper"  # original literal ended in a bare backslash (SyntaxError)
dir_list = os.listdir(path_dir)
for i in range(len(paper_web)):
    if paper_web[i].find("arxiv") != -1:  # only arXiv-hosted entries
        # One re.sub replaces the broken split("")/join chain (ValueError:
        # empty separator) and strips characters Windows forbids in filenames.
        paper_title[i] = re.sub(r'[\\/:*?"<>|\s]+', '_', paper_title[i].replace('"', ''))
        print(paper_title[i])
        path = paper_title[i] + ".pdf"  # the original missed the dot
        if path not in dir_list:
            get_pdf_arxiv(paper_web[i], os.path.join(path_dir, path))



