# Scrape the information of every room that is currently live on a live-streaming platform (Douyu).
import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
class Douyu(object):
    """Scrape the rooms listed on the Douyu "all rooms" directory.

    A Chrome browser is driven through Selenium: each listing page is parsed
    into a list of dicts, printed, and then the "next page" control is
    clicked. The crawl stops as soon as moving to the next page fails.
    """

    def __init__(self):
        """Store the directory URL and start a Chrome WebDriver session."""
        # Directory page the crawl starts from.
        self.url = "https://www.douyu.com/directory/all"
        # Browser driver used for every page load and element lookup.
        self.driver = webdriver.Chrome()

    def parse_data(self):
        """Extract the room entries from the page currently shown.

        Returns:
            list[dict]: one dict per matched room element, with the keys
            ``title``, ``type``, ``owner`` and ``num``, each holding the
            text of the corresponding sub-element.

        Raises:
            selenium.common.exceptions.NoSuchElementException: if a matched
                room element lacks one of the expected sub-elements.
        """
        # Pause for three seconds before querying so the room list has time
        # to load; querying earlier would run before the elements exist.
        time.sleep(3)
        room_list = self.driver.find_elements(
            By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div'
        )
        print(len(room_list))
        data_list = []
        # Walk the matched elements and pull the wanted fields out of each.
        for room in room_list:
            temp = {}
            temp['title'] = room.find_element(By.XPATH, './a/div[2]/div[1]/h3').text
            temp['type'] = room.find_element(By.XPATH, './a/div[2]/div[1]/span').text
            temp['owner'] = room.find_element(By.XPATH, './a[1]/div[2]/div[2]/h2').text
            # NOTE(review): 'num' is presumably the viewer/popularity figure
            # shown on the card - confirm against the live page.
            temp['num'] = room.find_element(By.XPATH, './a[1]/div[2]/div[2]/span').text
            data_list.append(temp)
        return data_list

    def save_data(self, data_list):
        """Print every dict in ``data_list``, one per line.

        Args:
            data_list: iterable of room dicts as returned by ``parse_data``.
        """
        for data in data_list:
            print(data)

    def run(self):
        """Open the directory page and crawl page after page.

        Each iteration parses the current page, prints the result and then
        tries to advance. The loop ends when scrolling, locating or clicking
        the "next page" control raises a Selenium error. Other exceptions
        (for example ``KeyboardInterrupt``) are deliberately not caught, so
        the crawl can be interrupted and real bugs are not hidden.
        """
        self.driver.get(self.url)
        while True:
            # parse
            data_list = self.parse_data()
            # save
            self.save_data(data_list)
            # next
            try:
                # Run JavaScript to scroll down the page before looking for
                # the pager.
                self.driver.execute_script('scrollTo(0,1000000)')
                # Locate the "next page" element and click it.
                el_next = self.driver.find_element(
                    By.XPATH, '//*[contains(text(),"下一页")]'
                )
                el_next.click()
            except WebDriverException:
                # WebDriverException is the base class of Selenium's lookup,
                # click and script errors; any of them ends the crawl.
                break
# Script entry point: crawl the directory and always release the browser.
if __name__ == '__main__':
    dy = Douyu()
    try:
        dy.run()
    finally:
        # Close the browser and end the WebDriver session even when the crawl
        # is interrupted or fails, so no Chrome process is left behind.
        dy.driver.quit()