查看以下实现。我使用
requests 模块而不是
urllib 来下载。而且,我使用
.select() 方法而不是
.find_all(),以避免使用
re。
"""Download every PDF linked from a course page into a local folder."""
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "http://www.gatsby.ucl.ac.uk/teaching/courses/ml1-2016.html"

# If there is no such folder, the script will create one automatically.
# NOTE(review): the original path was r'E:webscraping' — on Windows that is a
# drive-relative path; a rooted directory on E: was almost certainly intended.
folder_location = r'E:\webscraping'
# makedirs with exist_ok avoids the exists()/mkdir race and creates parents.
os.makedirs(folder_location, exist_ok=True)

response = requests.get(url)
# Fail loudly on HTTP errors instead of silently parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# CSS attribute-suffix selector: every anchor whose href ends in ".pdf".
for link in soup.select("a[href$='.pdf']"):
    # Name the pdf files using the last portion of each link, which is
    # unique in this case.
    filename = os.path.join(folder_location, link['href'].split('/')[-1])
    with open(filename, 'wb') as f:
        # urljoin resolves relative hrefs against the page URL before fetching.
        f.write(requests.get(urljoin(url, link['href'])).content)


