# Follow-up to a previous attempt at scraping Twitter data (see the earlier
# CSDN blog post "爬取twitter数据"), continuing after code found on GitHub
# proved hard to understand.
# Approach: Selenium login + asynchronous page loading + XPath extraction.
# Twitter only surfaces roughly the most recent week of data, so the crawl
# is necessarily incomplete and the code still needs further tuning.
from selenium import webdriver
import time
from datetime import datetime
from datetime import timedelta
import pandas as pd
import requests
import json
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import random
import logging
import urllib.error
from lxml import etree
from lxml import html
# Collect the text of every post visible on a Twitter search/browse page.
def get_posts(url):
    """Scroll a Twitter results page to the bottom and collect post texts.

    Twitter loads results asynchronously, so the page is scrolled repeatedly
    and the page source is re-parsed on every pass; the harvested texts are
    de-duplicated (first-seen order preserved) before returning, because the
    same posts are extracted on successive passes.

    Parameters
    ----------
    url : str
        URL of the page containing the posts (e.g. a search results page).

    Returns
    -------
    list[str]
        De-duplicated post texts in the order they were first seen.
    """
    wb = webdriver.Chrome()
    try:
        wb.get(url)
        time.sleep(3)
        # JS snippet returning the current full document height.
        # (The original 'return action=document.body.scrollHeight' leaked a
        # stray global `action` into the page; the plain return is correct.)
        js = 'return document.body.scrollHeight'
        height = wb.execute_script(js)
        wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(5)
        last_progress = int(time.time())
        retries = 0
        post_list = []
        while True:
            now = int(time.time())
            if now - last_progress < 30:
                # Parse while scrolling: after async loading, the final page
                # source does not contain everything, so we harvest on every
                # pass and de-duplicate at the end.
                selector = etree.HTML(wb.page_source)
                infos = selector.xpath(
                    "//*/div[@class='css-1dbjc4n r-18u37iz']"
                    "/div[2]/div[2]/div[1]")
                for info in infos:
                    post_list.append(info.xpath("string(.)").strip())
                new_height = wb.execute_script(js)
                if new_height > height:
                    # Page grew: scroll again and reset the progress clock.
                    time.sleep(1)
                    wb.execute_script(
                        'window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    last_progress = now
            elif retries < 3:
                # No growth for 30s: wait and retry a few times before
                # concluding we have reached the bottom of the feed.
                time.sleep(3)
                retries += 1
            else:
                # Timed out after retries — assume the page bottom.
                break
        # De-duplicate while preserving first-seen order
        # (dict keys keep insertion order in Python 3.7+).
        return list(dict.fromkeys(post_list))
    finally:
        # Always release the browser, even on errors (fixes a resource
        # leak: the original never called quit(), leaving Chrome running).
        wb.quit()
# Script entry point: guard the scrape so importing this module does not
# launch a browser as a side effect (the original ran unconditionally).
if __name__ == "__main__":
    url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
    post_list = get_posts(url)



