Nginx反爬虫UA及限制并发

处理请求头文件agent_deny.conf

# Block common crawlers operated from within China (enable or disable to suit your needs).
# `~*` performs a case-insensitive regex match against the User-Agent header;
# any match is answered with 403 Forbidden.
# NOTE(review): the bare alternative `spider` already matches every longer name
# in this list that contains it, as well as any other UA containing "spider".
if ($http_user_agent ~* "qihoobot|Yahoo! Slurp China|Baiduspider|Baiduspider-image|spider|Sogou spider|Sogou web spider|Sogou inst spider|Sogou spider2|Sogou blog|Sogou News Spider|Sogou Orion spider|ChinasoSpider|Sosospider|YoudaoBot|yisouspider|EasouSpider|Tomato Bot|Scooter") { 
    return 403;
}

# Block common crawlers operated from outside China (enable or disable to suit your needs).
# `~*` performs a case-insensitive regex match against the User-Agent header;
# any match is answered with 403 Forbidden.
# NOTE(review): short alternatives such as `Fast` and `Slurp` match any UA that
# merely contains those letters in any case — confirm this breadth is intended.
if ($http_user_agent ~* "Googlebot|Googlebot-Mobile|AdsBot-Google|Googlebot-Image|Mediapartners-Google|Adsbot-Google|Feedfetcher-Google|Yahoo! Slurp|MSNBot|Catall Spider|ArchitextSpider|AcoiRobot|Applebot|Bingbot|Discordbot|Twitterbot|facebookexternalhit|ia_archiver|linkedInBot|Naverbot|Pinterestbot|seznambot|Slurp|teoma|TelegramBot|Yandex|Yeti|Infoseek|Lycos|Gulliver|Fast|Grabber") { 
    return 403;
}

# Block the listed User-Agents and any request whose User-Agent is empty.
# `~` is a case-sensitive regex match, so `java/` and `Java` are distinct
# alternatives here. The final `^$` alternative matches an empty UA string.
if ($http_user_agent ~ "WinHttp|WebZIP|FetchURL|node-superagent|java/|Bytespider|FeedDemon|Jullo|JikeSpider|Indy Library|Alexa Toolbar|AskTbFXTV|AhrefsBot|CrawlDaddy|CoolpadWebkit|Java|Feedly|Apache-HttpAsyncClient|UniversalFeedParser|ApacheBench|Microsoft URL Control|Swiftbot|ZmEu|oBot|jaunty|Python-urllib|lightDeckReports Bot|YYSpider|DigExt|HttpClient|MJ12bot|heritrix|Ezooms|BOT/0.1|YandexBot|FlightDeckReports|Linguee Bot|iaskspider|^$") {
    return 403;             
}

# Reject every request whose method is not exactly GET, HEAD or POST.
# `!~` negates a case-sensitive regex match; the anchors `^...$` require the
# whole method name to match one of the three alternatives.
if ($request_method !~ ^(GET|HEAD|POST)$) {
    return 403;
}

# Block scraping tools such as Scrapy (case-insensitive match on the User-Agent).
# The commented-out line below is a broader alternative that would also reject
# generic HTTP clients (Python, Java, Wget, Curl, Postman, ...).
#if ($http_user_agent ~* (Python|Java|Wget|Scrapy|Curl|HttpClient|Spider|PostmanRuntime)) {
if ($http_user_agent ~* (Scrapy|HttpClient)) {
    return 403;
}

#屏蔽IP地址(取消注释即可生效，注意每条指令必须以分号结尾)
#deny 123.123.123.123;
#慎用-封IP段(123.0.0.1-123.255.255.254)
#deny 123.0.0.0/8;
#慎用-封IP段(123.123.0.1-123.123.255.254)
#deny 123.123.0.0/16;
#慎用-封IP段(123.123.123.1-123.123.123.254)
#deny 123.123.123.0/24;

#使用说明：在网站 xxx.conf 相关配置中的 server 段插入如下代码：
#include agent_deny.conf;

nginx.conf引入agent_deny.conf

http {
    #....

    server {
        #....

        # Pull in the crawler User-Agent rules. agent_deny.conf consists of
        # `if` blocks, which nginx only accepts in `server` or `location`
        # context, so the include must live inside `server` — placing it
        # directly under `http` makes nginx refuse to start.
        include agent_deny.conf;

        # Serve a static robots.txt that asks all crawlers to stay away.
        # This is advisory only: well-behaved bots honour it, others ignore it.
        # `location` is likewise only valid inside `server`.
        location = /robots.txt {
            # Set the MIME type here instead of adding a second Content-Type
            # header with add_header, which would duplicate the header.
            default_type text/plain;
            charset utf-8;
            # "\n" is the line break between the two robots.txt directives.
            return 200 "User-agent: *\nDisallow: /\n";
        }
    }

    #....
}

nginx.conf添加限制并发控制

http {
    #....

    # Shared-memory zone "conip" (10 MB) keyed by client IP, used to cap the
    # number of concurrent connections per client.
    limit_conn_zone $binary_remote_addr zone=conip:10m;
    # Shared-memory zone "reqip" (10 MB) keyed by client IP, used for request
    # rate limiting (leaky-bucket algorithm) at 10 requests per second.
    limit_req_zone $binary_remote_addr zone=reqip:10m rate=10r/s;

    # The zones above are declared at `http` level; the limits themselves are
    # applied inside a `location`, which is only valid within a `server` block.
    server {
        #....

        location / {
            # At most 2 simultaneous connections per client IP.
            limit_conn conip 2;
            # Allow bursts of up to 20 requests above the rate without delaying
            # them; anything beyond the burst is rejected.
            limit_req zone=reqip burst=20 nodelay;
            #limit_rate 512k;  # throttle the per-connection download rate
            #...
            ##allow 192.168.1.0/24;  # only allow the LAN address range
            ##deny all;              # reject every other IP
            #proxy_set_header Host $host;
            #proxy_set_header X-Real-IP $remote_addr;
            #proxy_set_header REMOTE-HOST $remote_addr;
            #proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
        }
    }

    #....
}

重启nginx服务后，校验是否拦截爬虫UA

# Each request below should be answered with "403 Forbidden" once the UA rules
# are active.
#   -I     sends a HEAD request and prints only the response headers. Do not
#          combine it with -X GET: that only changes the method word on the
#          wire while curl still handles the transfer as a HEAD request.
#   -A ''  makes curl omit the User-Agent header, exercising the empty-UA rule.
$ curl -I -A '' www.test.com
$ curl -I -A 'YYSpider' www.test.com
$ curl -I -A 'Baiduspider' www.test.com

Nginx反爬虫UA及限制并发

Python相关栏目本月热门文章