python selenium库的使用（一）

反爬

爬虫程序默认的请求头 User-Agent 往往带有明显特征（例如 Scrapy 默认的 User-Agent 含有“Scrapy”关键字，无头模式的 Chrome 含有“HeadlessChrome”），很容易被识别出来是爬虫，所以我们可以把它伪装成普通浏览器。用浏览器随意打开一个网站，在开发者工具的 Network 面板里查看任意请求的请求头，就可以看到这个浏览器的 User-Agent。

整体代码如下：

import time
from selenium import webdriver
from models import *


def get_driver():
    """Create a Chrome WebDriver configured to look like a regular desktop browser.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and is responsible
        for calling ``quit()`` when finished.
    """
    options = webdriver.ChromeOptions()
    # Optional switches, left disabled: skip image loading / run without a window.
    # options.add_argument('--blink-settings=imagesEnabled=false')
    # options.add_argument('--headless')
    options.add_argument('--window-size=1440,1080')
    options.add_argument('--disable-extensions')
    options.add_argument('--no-sandbox')  # needed when Chrome runs as root
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    # Exclude the "enable-automation" switch from the Chrome command line.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Arguments are handed to Chrome without a shell, so the value must NOT be
    # wrapped in quotes: they would become part of the User-Agent header.
    options.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.130 Safari/537.36')
    driver = webdriver.Chrome(options=options)
    return driver

def get_data(driver, keys, main_key, p=1):
    """Open one search-result page and locate the listing elements on it.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        keys: Path segment inserted after ``/s/`` in the search URL.
        main_key: Value of the ``query`` URL parameter.
        p: Value of the ``businessesPage`` URL parameter. The URL previously
            read an undefined name ``p`` (NameError on the first call); it is
            now a parameter. Defaults to 1 -- TODO confirm that the site's
            first page is numbered 1.
    """
    # Local import keeps this listing self-contained. The find_element_by_*
    # helpers were removed in Selenium 4.3; find_element(By..., ...) works on
    # both old and new releases.
    from selenium.webdriver.common.by import By

    url = f'https://xxxx.com/en-us/s/{keys}?businessesPage={p}&query={main_key}'
    driver.get(url)
    # Links inside the result container.
    tt = driver.find_element(By.CLASS_NAME, 'purify_1C2sKfbn9OVsW').find_elements(
        By.CSS_SELECTOR, ".purify_T9Yll5MsAPX > a")
    # Exact-match on the full class attribute, then read the nested text.
    ad = driver.find_element(
        By.CSS_SELECTOR,
        "[class='purify_X purify_2SvKv3MmhxAqq-wQiZJQc3 purify_g4p_j_IN6T']")
    ff = ad.find_element(
        By.CSS_SELECTOR,
        "[class='purify_1sQU5pf3yAvt purify_3k1NnTEGO6TSunXbY5Zrkx']").text
    # tt / ad / ff are consumed by the extraction code that the article elides.
    # ... (remaining extraction code omitted)

if __name__ == '__main__':
    # Script entry point: reuse a single browser session for every keyword.
    main_key = ''
    keys_list = ["a", "b"]
    driver = get_driver()
    try:
        for k in keys_list:
            get_data(driver, k, main_key)
    finally:
        # Always shut the browser down, even when a page fails to load or
        # parse; otherwise the Chrome/chromedriver processes are left running.
        driver.quit()

import time
from selenium import webdriver
from models import *


def get_driver():
    """Create a Chrome WebDriver configured to look like a regular desktop browser.

    Returns:
        A ``webdriver.Chrome`` instance. The caller owns it and is responsible
        for calling ``quit()`` when finished.
    """
    options = webdriver.ChromeOptions()
    # Optional switches, left disabled: skip image loading / run without a window.
    # options.add_argument('--blink-settings=imagesEnabled=false')
    # options.add_argument('--headless')
    options.add_argument('--window-size=1440,1080')
    options.add_argument('--disable-extensions')
    options.add_argument('--no-sandbox')  # needed when Chrome runs as root
    options.add_argument('--disable-setuid-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    # Exclude the "enable-automation" switch from the Chrome command line.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Arguments are handed to Chrome without a shell, so the value must NOT be
    # wrapped in quotes: they would become part of the User-Agent header.
    options.add_argument(
        '--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/79.0.3945.130 Safari/537.36')
    driver = webdriver.Chrome(options=options)
    return driver


def get_data():
    """Start a browser via get_driver() and load the Baidu home page.

    The rest of the function is elided in the article, so what happens to the
    page after it loads is not shown here.
    """
    driver = get_driver()
    url = f'https://www.baidu.com/'
    driver.get(url)
    # ... (remaining code omitted)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    get_data()

数据库

    # Append the scraped record to datalist as a dict.
    datalist.append(dict(name=name, phone=phone, address=address, source="aaa"))
    # Walk the collected records and insert only those whose name is not
    # already present in the database.
    for d in datalist:
        find_data = SQLsession.query(Infos).filter_by(name=d['name']).first()
        if not find_data:
            # NOTE(review): the dict carries phone/address/source keys, while
            # the Infos model listed in this article declares none of those
            # columns -- confirm the real model accepts these keyword arguments.
            SQLsession.add(Infos(**d))
    # Single commit for the whole batch of pending inserts.
    SQLsession.commit()

models

orm

from sqlalchemy import *
import pymysql
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
from datetime import datetime
# Connection URL for MySQL through the pymysql driver. The Chinese parts
# (password, host, database name) are placeholders to be replaced with real
# values; avoid committing real credentials in source.
database = 'mysql+pymysql://root:密码@ip地址或localhost/数据库名?charset=utf8mb4'

# Declarative base class that the ORM models below inherit from.
base = declarative_base()
# Create the database engine, a session factory bound to it, and one
# module-level session object.
engine = create_engine(database)
DBSession = sessionmaker(bind=engine)
SQLsession = DBSession()


# ORM
class Infos(base):
    """ORM model for one stored record.

    ``__tablename__`` is a placeholder (the literal means "table name") and
    must be replaced with the real table name.
    """
    __tablename__ = '表名'
    id = Column(Integer(), primary_key=True)
    code = Column(String(255))
    name = Column(String(255))
    status = Column(Integer(), default=1)
    remark = Column(Text)
    # Pass the callable, not its result: ``datetime.now()`` would be evaluated
    # once at import time and every row would share that single timestamp.
    created = Column(DateTime, default=datetime.now)
    updated = Column(DateTime, default=datetime.now, onupdate=datetime.now)

# Create the tables for every model registered on `base`; this runs as a
# side effect of importing the module.
base.metadata.create_all(engine)

随机代理IP

对于封IP的网站，延时的方法使得爬虫效率大大下降。更好的办法是使用IP代理，每次向服务器请求使用不同的IP。

python selenium库的使用（一）

Python相关栏目本月热门文章