前几天做项目要用到天气数据,就写了个爬虫脚本爬了一些,希望可以帮到大家。
代码如下:
import pandas as pd
from selenium import webdriver
import calendar
# ---- Configuration: edit these to choose the cities, date range and output dir ----
# City slugs as used in lishi.tianqi.com URLs.
place=['shenzhen','shanghai','beijing','guangzhou','tianjin','wuhan','chongqing','xiamen']
start_year=2013
start_month=1
end_year=2021
end_month=6
end_day=20
# Output directory; one CSV per city is written here (path must already exist).
path='C:/Users/86189/Desktop/论文数据/rawdata/地区气候/'
def crawl(place,start_year,start_month,end_year,end_month,end_day):
    """Scrape daily weather records for one city from lishi.tianqi.com.

    Loads one monthly history page per month in the inclusive range
    (start_year, start_month) .. (end_year, end_month); the final month is
    truncated at ``end_day``.  The collected columns (date, high temp, low
    temp, weather, wind direction) are written to ``path + place + '.csv'``
    in gbk encoding.

    Args:
        place: city slug used in the URL, e.g. 'shenzhen'.
        start_year, start_month: first month to fetch (inclusive).
        end_year, end_month: last month to fetch (inclusive).
        end_day: last day of the final month to fetch.
    """
    out_path = path + place + '.csv'

    def add_zero(number):
        # Zero-pad single-digit positive numbers: 7 -> '07', 12 -> '12'.
        if 0 < number < 10:
            return '0' + str(number)
        else:
            return str(number)

    def make_date_range(start_year, start_month, end_year, end_month):
        # Build the list of 'YYYYMM' strings plus a parallel list of
        # [year, month] pairs for every month in the inclusive range.
        months = []
        if start_year == end_year:
            # BUG FIX: the original three-loop version emitted months of the
            # single year twice when start_year == end_year.
            for j in range(start_month, end_month + 1):
                months.append((start_year, j))
        else:
            for j in range(start_month, 13):
                months.append((start_year, j))
            for year in range(start_year + 1, end_year):
                for j in range(1, 13):
                    months.append((year, j))
            for j in range(1, end_month + 1):
                months.append((end_year, j))
        date_range = []
        year_month = []
        for year, j in months:
            date_range.append(str(year) + add_zero(j))
            year_month.append([year, j])
        return date_range, year_month

    # Headless Chrome: no visible browser window.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.headless = True
    chrome = webdriver.Chrome(chrome_options=chrome_options)
    try:
        time_list, year_month = make_date_range(start_year, start_month, end_year, end_month)
        url_list = []
        for t in time_list:
            url_list.append('https://lishi.tianqi.com/' + place + '/' + t + '.html')
        # One list per output column: date, high, low, weather, wind direction.
        out_data = [[], [], [], [], []]
        for page_idx in range(len(url_list)):
            chrome.get(url_list[page_idx])
            # monthrange -> (weekday of day 1, number of days in month).
            _, day_num = calendar.monthrange(year_month[page_idx][0], year_month[page_idx][1])
            if page_idx == len(url_list) - 1:
                # The last month only has data up to end_day.
                day_num = end_day
            # Click the "expand" control so all days of the month are listed.
            button = chrome.find_element_by_xpath('/html/body/div[7]/div[1]/div[4]/ul/div')
            button.click()
            for day in range(1, day_num + 1):
                xpath = '/html/body/div[7]/div[1]/div[4]/ul/li[' + str(day)
                try:
                    # BUG FIX: the original reused `i` here, shadowing the
                    # outer page index; use a distinct column counter.
                    for col in range(1, 6):
                        all_xpath = xpath + ']/div[' + str(col) + ']'
                        text = chrome.find_element_by_xpath(all_xpath).text
                        text = text.replace('℃', '')
                        # Strip a trailing ' 星期X' (weekday) suffix from dates.
                        if text[-3:-1] == '星期':
                            text = text[:-3]
                        out_data[col - 1].append(text)
                except Exception:
                    # Best effort: a missing day/cell on the page is skipped.
                    pass
        # BUG FIX: pd.Dataframe does not exist (AttributeError); it is DataFrame.
        out_table = pd.DataFrame()
        out_table['date'] = out_data[0]
        out_table['hight_temprature(℃)'] = out_data[1]
        out_table['low_temprature(℃)'] = out_data[2]
        out_table['weather'] = out_data[3]
        out_table['The direction of the wind'] = out_data[4]
        out_table.to_csv(out_path, encoding='gbk')
    finally:
        # Always shut the browser down, even if scraping raised.
        chrome.quit()
# Scrape every configured city over the shared date range.
for p in place:
    crawl(p,start_year,start_month,end_year, end_month, end_day)



