导包:
org.jsoup:jsoup:1.13.1(groupId:artifactId:version)
访问链家二手房网站可以发现,单页30条数据,最多100页数据。但总量可能在几万到十几万条,所以需要利用到搜索条件进行抓取。以下以北京为例:
可以看到,选中某个区域后,url 会按固定规则变化,所以可以根据这个规则抓取所有的 url:
public static ListgetLianJiaAllUrl(String url) throws IOException { List allUrl = new ArrayList<>(); document doc = JsoupUtil.getDocByProxyTwo(url); Elements e = doc.getElementsByAttributevalue("data-role","ershoufang").select("a"); List mainUrls = new ArrayList<>(); for(Element element : e) { mainUrls.add(url.substring(0, url.indexOf(".com")) + ".com/" + element.attr("href")); } if(!CollectionUtils.isEmpty(mainUrls)){ mainUrls.parallelStream().forEach(main->{ try { document document = JsoupUtil.getDocByProxyTwo(main); Elements elements = document.getElementsByClass("total fl").select("span"); for(Element e1 : elements){ //当该区域下总数大于3000时获取更小区域的url if(Integer.valueOf(e1.text())>=3000){ Element e2 = document.getElementsByAttributevalue("data-role","ershoufang").select("div").get(2); for(Element element : e2.select("a")){ allUrl.add(url.substring(0,url.indexOf(".com"))+".com/"+element.attr("href")); } }else{ allUrl.add(main); } } } catch (IOException ioException) { ioException.printStackTrace(); } }); } if(!CollectionUtils.isEmpty(allUrl)){ List allUrlNew = allUrl.stream().distinct().collect(Collectors.toList()); return allUrlNew; } return allUrl; }
现在得到所有的有数据的子链接,接下来就是分页了:
/**
 * Reads how many result pages exist under a listing URL.
 *
 * <p>The count is taken from the {@code totalPage} field of the JSON stored in the
 * {@code page-data} attribute of the first pagination element.
 *
 * @param url listing URL to inspect
 * @return the total page count, or 0 when the page could not be fetched or has no pagination data
 * @throws IOException if the request fails with an I/O error
 */
public static int getLianJiaTotalPage(String url) throws IOException {
    Document doc = JsoupUtil.getDocByProxyTwo(url);
    if (doc == null) {
        // getDocByProxyTwo reports a failed request by returning null.
        return 0;
    }
    Element pager = doc.getElementsByClass("page-box house-lst-page-box").first();
    if (pager == null) {
        return 0;
    }
    JSONObject pageData = JSONObject.parseObject(pager.attr("page-data"));
    // getIntValue yields 0 for a missing key instead of failing on an unboxing cast.
    return pageData == null ? 0 : pageData.getIntValue("totalPage");
}
以上可以得到链接下共有多少页数据,根据页数整理所有url(缩略片段代码):
Listlist = JsoupUtil.getLianJiaAllUrl(url); if (!CollectionUtils.isEmpty(list)) { List pgList = Collections.synchronizedList(new ArrayList<>()); list.parallelStream().forEach(e -> { try { int totalPage = JsoupUtil.getLianJiaTotalPage(e); if (totalPage != 0) { synchronized (pgList){ pgList.add(e); for (int i = 1; i < totalPage + 1; i++) { String newUrl = e + "pg" + i + "/"; pgList.add(newUrl); } } } } catch (IOException ioException) { ioException.printStackTrace(); } }); System.out.println(pgList.size()); }
得到所有链接就可以开始抓取数据了,数据表结构如下,分为二手房简介数据表,详情数据表,小区表:
-- Summary table for Beijing second-hand listings (table comment: '北京二手房简介信息').
-- One row per listing: `detail_url` carries a UNIQUE key, so the same listing cannot be stored twice.
-- The *_tag columns (`house_type_tag`, `area_count_tag`, `amount_tag`) have no column comment;
-- presumably they hold the bucket codes (h1..h6, a1..a8, p1..p8) that the crawler derives from
-- room count, floor area and total price -- confirm against the entity mapping.
-- The composite `search` index covers the columns used as list filters.
CREATE TABLE `es_house_beijing` ( `id` int(11) NOT NULL AUTO_INCREMENT, `city` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '城市', `area` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域', `build_name` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '小区名称', `build_type` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '建筑类型', `build_renovation` varchar(10) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '装修情况', `build_time` int(11) DEFAULT NULL COMMENT '建造时间', `house_floor` varchar(20) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '楼层', `house_label` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '标签', `house_type` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '户型', `house_type_tag` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL, `house_orientation` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '朝向', `all_amount` decimal(10,2) DEFAULT NULL COMMENT '总价万元', `amount_tag` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL, `area_count` decimal(10,2) DEFAULT NULL COMMENT '面积', `area_count_tag` varchar(5) COLLATE utf8mb4_bin DEFAULT NULL, `unit_price` int(11) DEFAULT NULL COMMENT '单价元/平', `follow_num` int(11) DEFAULT NULL COMMENT '关注数', `publish_time` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '发布时间', `detail_url` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL, `area_id` int(11) DEFAULT NULL, `create_time` datetime DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `url` (`detail_url`), KEY `search` (`city`,`build_name`,`house_orientation`,`house_floor`,`build_time`,`build_renovation`,`build_type`,`house_label`,`area_count_tag`,`amount_tag`,`house_type_tag`) ) ENGINE=InnoDB AUTO_INCREMENT=384947 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='北京二手房简介信息'
-- Detail table for second-hand listings (table comment: '二手房详细信息').
-- `house_id` is commented as the id of the es_house table and carries a UNIQUE key,
-- i.e. at most one detail row per summary row. No foreign key is declared.
-- `house_type_detail` (TEXT, '户型分间') presumably stores the serialized per-room list
-- produced by the detail crawler -- confirm against the entity mapping.
CREATE TABLE `es_house_detail_beijing` ( `id` int(11) NOT NULL AUTO_INCREMENT, `house_id` int(11) DEFAULT NULL COMMENT 'es_house表id', `area` varchar(68) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '区域', `house_type` varchar(20) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '户型', `house_type_img` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '户型图', `house_structure` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '户型结构', `house_area` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '套内面积', `build_structure` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '建筑结构', `elevator_rate` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '梯户比例', `heating_mode` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '供暖方式', `is_elevator` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '配备电梯', `listing_time` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '挂牌时间', `transaction_ownership` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '交易权属', `last_transaction` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '上次交易', `house_purpose` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '房屋用途', `house_years` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '房屋年限', `property_ownership` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '产权所属', `mortgage_mess` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '抵押信息', `certificate` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '房本备件', `house_label` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '房源标签', `selling_point` varchar(2048) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '核心卖点', `build_introduce` varchar(512) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '小区介绍', `build_periphery` varchar(512) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '周边配套', `build_traffic` varchar(512) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '交通出行', `commended_user` varchar(512) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '适宜人群', `house_type_introduce` varchar(512) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '户型介绍', 
`sale_detail` varchar(512) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '售房详情', `house_type_detail` text COLLATE utf8mb4_bin COMMENT '户型分间', `create_time` datetime DEFAULT NULL, PRIMARY KEY (`id`), UNIQUE KEY `house` (`house_id`), KEY `search` (`house_id`,`area`,`house_purpose`,`heating_mode`,`transaction_ownership`,`listing_time`) ) ENGINE=InnoDB AUTO_INCREMENT=104116 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='二手房详细信息'
-- Residential community table (table comment: '小区表').
-- Holds per-community counts (buildings, households, listed / rented units, recent deals),
-- property-management details and the location as longitude / latitude with 7 decimal places.
-- Lookups by district are served by the `area` index on (`area`, `city`).
CREATE TABLE `es_house_areas` ( `id` int(11) NOT NULL AUTO_INCREMENT, `city` varchar(15) COLLATE utf8mb4_bin DEFAULT NULL, `area` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL, `name` varchar(128) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '小区名称', `build_years` varchar(20) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '建筑年代', `build_type` varchar(20) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '建筑类型', `build_count` int(11) DEFAULT NULL COMMENT '楼栋总数/栋', `house_type_count` int(11) DEFAULT NULL COMMENT '房屋总数/户', `listing_house` int(11) DEFAULT NULL COMMENT '挂牌房源', `lease_house` int(11) DEFAULT NULL COMMENT '出租数量', `deal_num` int(11) DEFAULT NULL COMMENT '90天内成交数量', `estate_price` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '物业费用', `estate_company` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '物业公司', `developers` varchar(45) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '开发商', `house_num` int(11) DEFAULT NULL COMMENT '房屋总数', `average_price` varchar(20) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '均价', `area_detail` varchar(256) COLLATE utf8mb4_bin DEFAULT NULL COMMENT '详细位置', `longitude` decimal(10,7) DEFAULT NULL COMMENT '经度', `latitude` decimal(10,7) DEFAULT NULL COMMENT '纬度', PRIMARY KEY (`id`), KEY `area` (`area`,`city`) ) ENGINE=InnoDB AUTO_INCREMENT=27451 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin COMMENT='小区表'
根据url进行数据抓取(这一块因为时间紧迫写的有点草率,还可以优化优化):
//二手房列表数据 public static ListcrawlListData(String url,String city){ try { List list = new ArrayList<>(); document doc = JsoupUtil.getDocByProxyTwo(url); // document doc = Jsoup.connect(url).get(); Elements ele = doc.getElementsByClass("sellListContent"); Elements ele2 = ele.select("li"); for(Element e : ele2){ EsHouse esHouse = new EsHouse(); esHouse.setCity(city); Elements es = e.getElementsByClass("positionInfo"); for(Element e1 : es){ esHouse.setBuildName(e1.select("a").text().split(" ")[0]); esHouse.setArea(e1.select("a").text().split(" ")[1]); } for(String s : e.getElementsByClass("houseInfo").text().split(" [|] ")){ if(s.contains("室") && s.contains("厅")){ esHouse.setHouseType(s); if(esHouse.getHouseType()!=null){ if(esHouse.getHouseType().substring(0,1).equals("1")){ esHouse.setHouseTypeTag("h1"); }else if(esHouse.getHouseType().substring(0,1).equals("2")){ esHouse.setHouseTypeTag("h2"); }else if(esHouse.getHouseType().substring(0,1).equals("3")){ esHouse.setHouseTypeTag("h3"); }else if(esHouse.getHouseType().substring(0,1).equals("4")){ esHouse.setHouseTypeTag("h4"); }else if(esHouse.getHouseType().substring(0,1).equals("5")){ esHouse.setHouseTypeTag("h5"); }else if(Integer.valueOf(esHouse.getHouseType().substring(0,1))>5){ esHouse.setHouseTypeTag("h6"); } } }else if(s.contains("米")){ esHouse.setAreaCount(BigDecimal.valueOf(Double.valueOf(s.replaceAll("平米","")))); if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(50))<1){ esHouse.setAreaCountTag("a1"); }else if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(50))==1 && esHouse.getAreaCount().compareTo(BigDecimal.valueOf(70))<1){ esHouse.setAreaCountTag("a2"); }else if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(70))==1 && esHouse.getAreaCount().compareTo(BigDecimal.valueOf(90))<1){ esHouse.setAreaCountTag("a3"); }else if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(90))==1 && esHouse.getAreaCount().compareTo(BigDecimal.valueOf(120))<1){ esHouse.setAreaCountTag("a4"); }else 
if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(120))==1 && esHouse.getAreaCount().compareTo(BigDecimal.valueOf(150))<1){ esHouse.setAreaCountTag("a5"); }else if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(150))==1 && esHouse.getAreaCount().compareTo(BigDecimal.valueOf(200))<1){ esHouse.setAreaCountTag("a6"); }else if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(200))==1 && esHouse.getAreaCount().compareTo(BigDecimal.valueOf(300))<1){ esHouse.setAreaCountTag("a7"); }else if(esHouse.getAreaCount().compareTo(BigDecimal.valueOf(300))==1){ esHouse.setAreaCountTag("a8"); } }else if(s.contains("南") || s.contains("北") || s.contains("东") || s.contains("西")){ esHouse.setHouseOrientation(s); }else if(s.contains("装") || s.contains("毛坯") || s.contains("其他")){ esHouse.setBuildRenovation(s); }else if(s.contains("层")){ esHouse.setHouseFloor(s); }else if(s.contains("建")){ if(StringUtil.isNotBlank(s)){ esHouse.setBuildTime(Integer.valueOf(s.replaceAll("年建",""))); }else{ esHouse.setBuildTime(0); } }else if(s.contains("板塔结合") || s.contains("板楼") || s.contains("塔楼")){ esHouse.setBuildType(s); } } for(String s : e.getElementsByClass("followInfo").text().split(" / ")){ if(s.contains("关注")){ esHouse.setFollowNum(Integer.valueOf(s.split("人关注")[0])); }else if(s.contains("发布")){ if(s.contains("天")){ String a = s.split("天以前发布")[0]; esHouse.setPublishTime(TimeUtil.getTimeByDay(Integer.valueOf(a),new Date())); }else if(s.contains("月")){ String a = s.split("个月以前发布")[0]; esHouse.setPublishTime(TimeUtil.getTimeByMonth(Integer.valueOf(a))); }else if(s.contains("一年前")){ String a = String.valueOf(1); esHouse.setPublishTime(TimeUtil.getTimeByYear(Integer.valueOf(a))); } } } Elements es1 = e.getElementsByClass("tag"); for(Element e1 : es1){ esHouse.setHouseLabel(""); if(StringUtil.isNotBlank(e1.getElementsByClass("subway").text())){ esHouse.setHouseLabel(esHouse.getHouseLabel()+e1.getElementsByClass("subway").text()+","); } if 
(StringUtil.isNotBlank(e1.getElementsByClass("isVrFutureHome").text())) { esHouse.setHouseLabel(esHouse.getHouseLabel()+e1.getElementsByClass("isVrFutureHome").text()+","); } if (StringUtil.isNotBlank(e1.getElementsByClass("taxfree").text())) { esHouse.setHouseLabel(esHouse.getHouseLabel()+e1.getElementsByClass("taxfree").text()+","); } if (StringUtil.isNotBlank(e1.getElementsByClass("haskey").text())) { esHouse.setHouseLabel(esHouse.getHouseLabel()+e1.getElementsByClass("haskey").text()+","); } } esHouse.setUnitPrice(Integer.valueOf(e.getElementsByClass("unitPrice").text().split("元/平")[0].replaceAll(",",""))); esHouse.setAllAmount(BigDecimal.valueOf(Double.valueOf(e.getElementsByClass("totalPrice totalPrice2").text().split("万")[0]))); if(esHouse.getAllAmount()!=null){ if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(50))<1){ esHouse.setAmountTag("p1"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(50))==1 && esHouse.getAllAmount().compareTo(BigDecimal.valueOf(80))<1){ esHouse.setAmountTag("p2"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(80))==1 && esHouse.getAllAmount().compareTo(BigDecimal.valueOf(100))<1){ esHouse.setAmountTag("p3"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(100))==1 && esHouse.getAllAmount().compareTo(BigDecimal.valueOf(120))<1){ esHouse.setAmountTag("p4"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(120))==1 && esHouse.getAllAmount().compareTo(BigDecimal.valueOf(150))<1){ esHouse.setAmountTag("p5"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(150))==1 && esHouse.getAllAmount().compareTo(BigDecimal.valueOf(200))<1){ esHouse.setAmountTag("p6"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(200))==1 && esHouse.getAllAmount().compareTo(BigDecimal.valueOf(300))<1){ esHouse.setAmountTag("p7"); }else if(esHouse.getAllAmount().compareTo(BigDecimal.valueOf(300))==1){ esHouse.setAmountTag("p8"); } } 
esHouse.setDetailUrl(e.getElementsByClass("title").select("a").attr("href")); list.add(esHouse); } return list; }catch (Exception e){ e.printStackTrace(); } return null; }
二手房详情数据(忽略我草率的代码):
/**
 * Parses the detail page of one second-hand listing.
 *
 * <p>Reads the community link, the area breadcrumb, the price, the "base" and "transaction"
 * attribute lists, the descriptive text sections and, when present, the per-room table
 * (element id {@code infoList}), which is stored in serialized form.
 *
 * @param url URL of the listing's detail page
 * @return the parsed detail, or {@code null} when the page could not be fetched or parsed
 */
public static EsHouseDetail getHouseDetail(String url) {
    try {
        Document doc = JsoupUtil.getDocByProxyTwo(url);
        if (doc == null) {
            // getDocByProxyTwo reports a failed request by returning null.
            return null;
        }
        EsHouseDetail ehd = new EsHouseDetail();

        for (Element community : doc.getElementsByClass("communityName")) {
            ehd.setAreaUrl(url.substring(0, url.indexOf(".com")) + ".com/"
                    + community.getElementsByClass("info ").attr("href"));
        }

        // Area breadcrumb, every level followed by '|'.
        StringBuilder area = new StringBuilder();
        for (Element link : doc.getElementsByClass("areaName").select("a")) {
            area.append(link.text()).append("|");
        }
        ehd.setArea(area.toString());

        Element priceSpan = doc.getElementsByClass("price").select("span").first();
        if (priceSpan != null) {
            ehd.setPrice(priceSpan.text());
        }

        for (Element intro : doc.getElementsByClass("introContent")) {
            for (Element row : intro.getElementsByClass("base").select("li")) {
                String label = row.getElementsByTag("span").text();
                // Literal replace: the label comes from the page and must not be read as a regex.
                String value = row.text().replace(label, "");
                switch (label) {
                    case "房屋户型":
                        ehd.setHouseType(value);
                        break;
                    case "户型结构":
                        ehd.setHouseStructure(value);
                        break;
                    case "套内面积":
                        ehd.setHouseArea(value);
                        break;
                    case "建筑结构":
                        ehd.setBuildStructure(value);
                        break;
                    case "梯户比例":
                        ehd.setElevatorRate(value);
                        break;
                    case "供暖方式":
                        ehd.setHeatingMode(value);
                        break;
                    case "配备电梯":
                        ehd.setIsElevator(value);
                        break;
                    default:
                        break;
                }
            }
            for (Element row : intro.getElementsByClass("transaction").select("li")) {
                // Joined span texts, i.e. "<label> <value>".
                String text = row.getElementsByTag("span").text();
                if (text.contains("挂牌时间")) {
                    ehd.setListingTime(text.replace("挂牌时间 ", ""));
                } else if (text.contains("交易权属")) {
                    ehd.setTransactionOwnership(text.replace("交易权属 ", ""));
                } else if (text.contains("上次交易")) {
                    ehd.setLastTransaction(text.replace("上次交易 ", ""));
                } else if (text.contains("房屋用途")) {
                    ehd.setHousePurpose(text.replace("房屋用途 ", ""));
                } else if (text.contains("房屋年限")) {
                    ehd.setHouseYears(text.replace("房屋年限 ", ""));
                } else if (text.contains("产权所属")) {
                    ehd.setPropertyOwnership(text.replace("产权所属 ", ""));
                } else if (text.contains("抵押信息")) {
                    ehd.setMortgageMess(text.replace("抵押信息 ", ""));
                } else if (text.contains("房本备件")) {
                    ehd.setCertificate(text.replace("房本备件 ", ""));
                }
            }
        }

        for (Element more : doc.getElementsByClass("introContent showbasemore")) {
            String tags = more.getElementsByClass("tags clear").text();
            if (tags.contains("房源标签")) {
                ehd.setHouseLabel(tags.replace("房源标签 ", "").replace(" ", ","));
            }
            for (Element section : more.getElementsByClass("baseattribute clear")) {
                String text = section.text();
                if (text.contains("核心卖点")) {
                    ehd.setSellingPoint(text.replace("核心卖点 ", ""));
                } else if (text.contains("户型介绍")) {
                    ehd.setHouseTypeIntroduce(text.replace("户型介绍 ", ""));
                } else if (text.contains("周边配套")) {
                    ehd.setBuildPeriphery(text.replace("周边配套 ", ""));
                } else if (text.contains("交通出行")) {
                    ehd.setBuildTraffic(text.replace("交通出行 ", ""));
                } else if (text.contains("适宜人群")) {
                    ehd.setCommendedUser(text.replace("适宜人群 ", ""));
                } else if (text.contains("售房详情")) {
                    ehd.setSaleDetail(text.replace("售房详情 ", ""));
                } else if (text.contains("小区介绍")) {
                    ehd.setBuildIntroduce(text.replace("小区介绍 ", ""));
                }
            }
        }

        Element infoList = doc.getElementById("infoList");
        if (infoList != null) {
            ehd.setHouseTypeImg(doc.getElementsByClass("imgdiv").select("img").attr("src"));
            List<EsHouseTypeVo> rooms = new ArrayList<>();
            for (Element row : infoList.getElementsByClass("row")) {
                Elements cols = row.getElementsByClass("col");
                EsHouseTypeVo room = new EsHouseTypeVo();
                // Columns in page order: name, area, orientation, type. Extra columns are ignored.
                if (cols.size() > 0) {
                    room.setName(cols.get(0).text());
                }
                if (cols.size() > 1) {
                    room.setArea(cols.get(1).text());
                }
                if (cols.size() > 2) {
                    room.setOrientation(cols.get(2).text());
                }
                if (cols.size() > 3) {
                    room.setType(cols.get(3).text());
                }
                rooms.add(room);
            }
            // Stored in serialized form.
            ehd.setHouseTypeDetail(SerializeUtil.OutputStream(rooms));
        }
        return ehd;
    } catch (Exception e) {
        // Previously swallowed silently; log it like the other crawler methods do.
        e.printStackTrace();
    }
    return null;
}
小区数据:
public static ListgetHouseAreasList(String url,String city){ try { List list = new ArrayList<>(); document doc = JsoupUtil.getDocByProxyTwo(url); // document doc = Jsoup.connect(url).get(); Elements ele = doc.getElementsByClass("listContent").select("li"); System.out.println(ele.text()); String detailUrl = ""; for(Element e : ele){ EsHouseAreas eha = new EsHouseAreas(); eha.setListingHouse(Integer.valueOf(e.getElementsByClass("xiaoquListItemSellCount").select("span").text())); Elements ele1 = e.getElementsByClass("info"); for(Element e1 : ele1){ detailUrl = e1.getElementsByClass("title").select("a").attr("href"); if(e1.getElementsByClass("houseInfo").select("a").get(0).text().contains("户型")){ eha.setHouseTypeCount(Integer.valueOf(e1.getElementsByClass("houseInfo").select("a").get(0).text().replaceAll("共","").replaceAll("个户型",""))); }else{ eha.setDealNum(Integer.valueOf(e1.getElementsByClass("houseInfo").select("a").get(0).text().replaceAll("90天成交","").replaceAll("套","").replaceAll("30天成交",""))); } if(e1.getElementsByClass("houseInfo").select("a").get(1).text().contains("出租")){ eha.setLeaseHouse(Integer.valueOf(e1.getElementsByClass("houseInfo").select("a").get(1).text().replaceAll("套正在出租",""))); }else if(e1.getElementsByClass("houseInfo").select("a").get(1).text().contains("成交")){ eha.setDealNum(Integer.valueOf(e1.getElementsByClass("houseInfo").select("a").get(1).text().replaceAll("90天成交","").replaceAll("套","").replaceAll("30天成交",""))); }else if(e1.getElementsByClass("houseInfo").select("a").get(0).text().contains("户型")){ eha.setHouseTypeCount(Integer.valueOf(e1.getElementsByClass("houseInfo").select("a").get(1).text().replaceAll("共","").replaceAll("个户型",""))); } if(e1.getElementsByClass("houseInfo").select("a").size()>2 && e1.getElementsByClass("houseInfo").select("a").get(1).text().contains("出租")){ eha.setLeaseHouse(Integer.valueOf(e1.getElementsByClass("houseInfo").select("a").get(1).text().replaceAll("套正在出租",""))); } 
eha.setArea(e1.getElementsByClass("positionInfo").select("a").get(0).text()); } eha.setCity(city); document doc1 = JsoupUtil.getDocByProxyTwo(detailUrl); // document doc1 = Jsoup.connect(detailUrl).get(); Elements ele2 = doc1.getElementsByClass("xiaoquInfo"); Elements elements = doc1.getElementsByClass("detailTitle"); eha.setName(elements.text()); for(Element e1 : ele2) { Elements ele3 = e1.getElementsByClass("xiaoquInfoItem"); for (Element e2 : ele3) { if(e2.text().contains("物业费用")){ eha.setEstatePrice(e2.text().replaceAll("物业费用","")); }else if(e2.text().contains("物业公司")){ eha.setEstateCompany(e2.text().replaceAll("物业公司","")); }else if(e2.text().contains("开发商")){ eha.setDevelopers(e2.text().replaceAll("开发商","")); }else if(e2.text().contains("楼栋总数")){ eha.setBuildCount(Integer.valueOf(e2.text().replaceAll("楼栋总数","").replaceAll("栋",""))); }else if(e2.text().contains("房屋总数")){ eha.setHouseNum(Integer.valueOf(e2.text().replaceAll("房屋总数","").replaceAll("户",""))); }else if(e2.text().contains("建筑年代")){ eha.setBuildYears(e2.text().replaceAll("建筑年代","")); }else if(e2.text().contains("建筑类型")){ eha.setBuildType(e2.text().replaceAll("建筑类型","")); } } } list.add(eha); } return list; }catch (Exception e){ e.printStackTrace(); } return null; }
代理方法(如果不用代理可以去掉代理的方式,直接Jsoup.connect().get()):
/**
 * Downloads a page through a randomly chosen authenticated HTTP proxy and parses it with jsoup.
 *
 * <p>The HTTP client, the response and the response stream are closed on every path. The body
 * is handed to jsoup as a stream, so line breaks inside the markup are preserved (the previous
 * version joined the lines without a separator, which can fuse tokens split across lines).
 *
 * @param href URL of the page to download
 * @return the parsed document, or {@code null} when the request or the parsing failed
 * @throws IOException declared for callers; failures are currently logged and reported as {@code null}
 */
public static Document getDocByProxyTwo(String href) throws IOException {
    // Placeholder kept from the article: replace it with the real list of "host:port" proxy entries.
    List<String> ipList = "放代理用的ip";
    String ipstr = ipList.get(new Random().nextInt(ipList.size()));
    String[] hostAndPort = ipstr.split(":");
    String ip = hostAndPort[0];
    int port = Integer.parseInt(hostAndPort[1]);
    try {
        HttpHost proxy = new HttpHost(ip, port, "http");
        CredentialsProvider provider = new BasicCredentialsProvider();
        // NOTE(review): proxy credentials are hard-coded; move them to configuration.
        provider.setCredentials(new AuthScope(proxy), new UsernamePasswordCredentials("hpc11", "5768"));
        RequestConfig requestConfig = RequestConfig.custom().setProxy(proxy).build();
        HttpGet get = new HttpGet(href);
        get.setConfig(requestConfig);
        try (CloseableHttpClient httpClient =
                        HttpClients.custom().setDefaultCredentialsProvider(provider).build();
                CloseableHttpResponse response = httpClient.execute(get);
                InputStream inputStream = response.getEntity().getContent()) {
            // Decode explicitly instead of with the platform default charset.
            // Assumes the crawled pages are UTF-8 -- confirm before pointing this at other sites.
            return Jsoup.parse(inputStream, "UTF-8", "");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    // Reached only after a failure: report which proxy was in use.
    System.out.println("ip+++++++++++++++++++++++++++++++++"+ip+":"+port);
    return null;
}
完结
暂时就不放所有代码了,如果还有不会的可以私聊。也请大佬们多提提修改意见。



