# Follow-up to a previous attempt at scraping Twitter data (see the earlier
# CSDN blog post "爬取twitter数据"), continuing after code found on GitHub
# proved hard to understand.
# Approach: Selenium login + asynchronous page loading + XPath extraction.
# Twitter only surfaces roughly the most recent week of data, so the crawl
# is necessarily incomplete and the code still needs further tuning.
from selenium import webdriver
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd
import requests
import json
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import random
import logging
import urllib.error
from lxml import etree
from lxml import html
# Collect the text of every post visible on a Twitter search/browse page.
def get_posts(url):
    """Scroll a Twitter results page to the bottom and collect post texts.

    Twitter loads results asynchronously, so the page is scrolled repeatedly
    and the page source is re-parsed on every pass; the harvested texts are
    de-duplicated (first-seen order preserved) before returning, because the
    same posts are extracted on successive passes.

    Parameters
    ----------
    url : str
        URL of the page containing the posts (e.g. a search results page).

    Returns
    -------
    list[str]
        De-duplicated post texts in the order they were first seen.
    """
    wb = webdriver.Chrome()
    try:
        wb.get(url)
        time.sleep(3)
        # JS snippet returning the current full document height.
        # (The original 'return action=document.body.scrollHeight' leaked a
        # stray global `action` into the page; the plain return is correct.)
        js = 'return document.body.scrollHeight'
        height = wb.execute_script(js)
        wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(5)
        last_progress = int(time.time())
        retries = 0
        post_list = []
        while True:
            now = int(time.time())
            if now - last_progress < 30:
                # Parse while scrolling: after async loading, the final page
                # source does not contain everything, so we harvest on every
                # pass and de-duplicate at the end.
                selector = etree.HTML(wb.page_source)
                infos = selector.xpath(
                    "//*/div[@class='css-1dbjc4n r-18u37iz']"
                    "/div[2]/div[2]/div[1]")
                for info in infos:
                    post_list.append(info.xpath("string(.)").strip())
                new_height = wb.execute_script(js)
                if new_height > height:
                    # Page grew: scroll again and reset the progress clock.
                    time.sleep(1)
                    wb.execute_script(
                        'window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    last_progress = now
            elif retries < 3:
                # No growth for 30s: wait and retry a few times before
                # concluding we have reached the bottom of the feed.
                time.sleep(3)
                retries += 1
            else:
                # Timed out after retries — assume the page bottom.
                break
        # De-duplicate while preserving first-seen order
        # (dict keys keep insertion order in Python 3.7+).
        return list(dict.fromkeys(post_list))
    finally:
        # Always release the browser, even on errors (fixes a resource
        # leak: the original never called quit(), leaving Chrome running).
        wb.quit()
# Script entry point: guard the scrape so importing this module does not
# launch a browser as a side effect (the original ran unconditionally).
if __name__ == "__main__":
    url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
    post_list = get_posts(url)



