栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Java

java代理实现爬取代理IP的示例

Java 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

java代理实现爬取代理IP的示例

仅仅使用了一个java文件,运行main方法即可,需要依赖的jar包是com.alibaba.fastjson(版本1.2.28)和Jsoup(版本1.10.2)

如果用了pom,那么就是以下两个:


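<dependency>
  <groupId>com.alibaba</groupId>
  <artifactId>fastjson</artifactId>
  <version>1.2.28</version>
</dependency>

<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.10.2</version>
</dependency>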

完整的代码如下:

package com.tuniu.fcm.facade.IPProxy;
import com.alibaba.fastjson.JSONObject;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ProxyCralwerUnusedVPN {
  ThreadLocal localWantedNumber = new ThreadLocal();
  ThreadLocal> localProxyInfos = new ThreadLocal>();
  public static void main(String[] args) {
    ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN();
    
    proxyCrawler.startCrawler(1);
  }
  
  public String startCrawler(int wantedNumber) {
    localWantedNumber.set(wantedNumber);
    kuaidailiCom("http://www.xicidaili.com/nn/", 15);
    kuaidailiCom("http://www.xicidaili.com/nt/", 15);
    kuaidailiCom("http://www.xicidaili.com/wt/", 15);
    kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15);
    kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15);
    kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15);
    
    ProxyResponse response = new ProxyResponse();
    response.setSuccess("true");
    Map dataInfoMap = new HashMap();
    dataInfoMap.put("numFound", localProxyInfos.get().size());
    dataInfoMap.put("pageNum", 1);
    dataInfoMap.put("proxy", localProxyInfos.get());
    response.setData(dataInfoMap);
    String responseString = JSONObject.toJSON(response).toString();
    System.out.println(responseString);
    return responseString;
  }
  private void kuaidailiCom(String baseUrl, int totalPage) {
    String ipReg = "\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} \d{1,6}";
    Pattern ipPtn = Pattern.compile(ipReg);
    for (int i = 1; i < totalPage; i++) {
      if (getCurrentProxyNumber() >= localWantedNumber.get()) {
 return;
      }
      try {
 document doc = Jsoup.connect(baseUrl + i + "/")
     .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
     .header("Accept-Encoding", "gzip, deflate, sdch")
     .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6")
     .header("Cache-Control", "max-age=0")
     .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
     .header("cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
     .header("Host", "www.kuaidaili.com")
     .header("Referer", "http://www.kuaidaili.com/free/outha/")
     .timeout(30 * 1000)
     .get();
 Matcher m = ipPtn.matcher(doc.text());
 while (m.find()) {
   if (getCurrentProxyNumber() >= localWantedNumber.get()) {
     break;
   }
   String[] strs = m.group().split(" ");
   if (checkProxy(strs[0], Integer.parseInt(strs[1]))) {
     System.out.println("获取到可用代理IPt" + strs[0] + "t" + strs[1]);
     addProxy(strs[0], strs[1], "http");
   }
 }
      } catch (Exception e) {
 e.printStackTrace();
      }
    }
  }
  private static boolean checkProxy(String ip, Integer port) {
    try {
      //http://1212.ip138.com/ic.asp 可以换成任何比较快的网页
      Jsoup.connect("http://1212.ip138.com/ic.asp")
   .timeout(2 * 1000)
   .proxy(ip, port)
   .get();
      return true;
    } catch (Exception e) {
      return false;
    }
  }
  private int getCurrentProxyNumber() {
    List proxyInfos = localProxyInfos.get();
    if (proxyInfos == null) {
      proxyInfos = new ArrayList();
      localProxyInfos.set(proxyInfos);
      return 0;
    }
    else {
      return proxyInfos.size();
    }
  }
  private void addProxy(String ip, String port, String protocol){
    List proxyInfos = localProxyInfos.get();
    if (proxyInfos == null) {
      proxyInfos = new ArrayList();
      proxyInfos.add(new ProxyInfo(ip, port, protocol));
    }
    else {
      proxyInfos.add(new ProxyInfo(ip, port, protocol));
    }
  }
}
/**
 * Bean describing a single proxy endpoint.
 *
 * <p>Address fields are supplied through the constructor; {@code userName} and
 * {@code password} default to the empty string and {@code is_internet} defaults to 1.
 */
class ProxyInfo {
  // Endpoint address, set by the constructor.
  private String ip;
  private String port;
  private String type;

  // Credentials; empty unless a setter is called.
  private String userName = "";
  private String password = "";

  private int is_internet = 1;

  /**
   * @param ip proxy host
   * @param port proxy port as text
   * @param type proxy type label
   */
  public ProxyInfo(String ip, String port, String type) {
    this.ip = ip;
    this.port = port;
    this.type = type;
  }

  /** @return the proxy host */
  public String getIp() {
    return this.ip;
  }

  /** @param ip the proxy host */
  public void setIp(String ip) {
    this.ip = ip;
  }

  /** @return the proxy port as text */
  public String getPort() {
    return this.port;
  }

  /** @param port the proxy port as text */
  public void setPort(String port) {
    this.port = port;
  }

  /** @return the proxy type label */
  public String getType() {
    return this.type;
  }

  /** @param type the proxy type label */
  public void setType(String type) {
    this.type = type;
  }

  /** @return the user name, empty by default */
  public String getUserName() {
    return this.userName;
  }

  /** @param userName the user name */
  public void setUserName(String userName) {
    this.userName = userName;
  }

  /** @return the password, empty by default */
  public String getPassword() {
    return this.password;
  }

  /** @param password the password */
  public void setPassword(String password) {
    this.password = password;
  }

  /** @return the is_internet value, 1 by default */
  public int getIs_internet() {
    return this.is_internet;
  }

  /** @param is_internet the is_internet value */
  public void setIs_internet(int is_internet) {
    this.is_internet = is_internet;
  }
}
/**
 * Response envelope: a success flag plus a map of named payload values.
 */
class ProxyResponse {
  /** Success flag as text. */
  private String success;
  /** Payload values keyed by name; {@code null} until {@link #setData(Map)} is called. */
  private Map<String, Object> data;

  /** @return the success flag, or {@code null} if it was never set */
  public String getSuccess() {
    return success;
  }

  /** @param success the success flag as text */
  public void setSuccess(String success) {
    this.success = success;
  }

  /** @return the payload map exactly as it was set (not copied), or {@code null} */
  public Map<String, Object> getData() {
    return data;
  }

  /** @param data the payload map; stored by reference */
  public void setData(Map<String, Object> data) {
    this.data = data;
  }
}

以上这篇java代理实现爬取代理IP的示例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持考高分网。

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/140982.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号