在工作中,经常会做一些与地区、地址有关的需求,例如在网页或者 App 端展示省、市、区三级下拉选择。本文旨在帮助大家从国家统计局获取最新的省市区数据,并用于项目中。
以下代码支持爬取省、市、区、镇、街道五级数据,设置有 2 个全局开关(IS_GET_TOEN、IS_GET_STREET),默认只爬取省市区。程序先把网页保存到本地,然后从本地读取已保存的 html 网页并解析成 json 对象;也可以再转成 Excel,自己找插件或者写代码转换即可,非常方便。
首先说说这个爬虫的几个注意事项:
1.因为爬虫需要多次与远端服务器连接,并发连接时可能会遇到以下错误。该错误与代码无关,而是网络原因:连接时发生了多次重定向。我没有花太多时间研究解决办法,目前等一会儿再次执行即可。如果有好的解决方法,请在评论区留言,感激。
java.io.IOException: Too many redirects occurred trying to load URL
2.如果在读取本地 html 网页文件的时候报错 FileNotFoundException,说明爬下来的网页文件不完整,缺失了一部分。此时可以将根目录下的 region 文件夹删除,下次启动程序时,会再次自动从远端爬取 html 网页文件并写入到本地。
3.如果在日志中看到“无法链接,正在重试~”,就是遇到了上面所说的多次重定向问题,这样写入到本地的网页文件可能不完整。即使再次启动程序后成功转出 json,也需要开发者自行检查转出的 json 数据是否完整。
package com.lockie.region;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.lockie.region.entity.City;
import com.lockie.region.entity.County;
import com.lockie.region.entity.Province;
import com.lockie.region.entity.Street;
import com.lockie.region.entity.Town;
import com.lockie.region.enums.AreaLevelEnum;
import com.lockie.region.enums.OperationType;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor.AbortPolicy;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpStatus;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.helpers.MessageFormatter;
/**
 * Crawls the 2020 administrative-division pages from stats.gov.cn, stores the raw HTML under
 * {@link #REGION}, and then rebuilds a province/city/county (optionally town/street) tree from
 * the stored files and prints it as JSON.
 */
@Slf4j
public class RegionTask {
// Prefix of every crawled page; province links are appended to it and it is stripped from absolute URLs.
public static final String base_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/";
// Entry page that lists the provinces.
public static final String AREA_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html";
// Shared flag set by the crawl loops when a fetch returned null; getdocument reads it to decide whether to sleep.
// NOTE(review): written by several pool threads without volatile/synchronization.
private static Boolean isRetry = false;
// Passed to jsoup's Connection.timeout(...) in getConnection.
private static final Integer TIMEOUT = 500000;
// Local directory that receives the downloaded pages.
private static final String REGION = "./region/";
// NOTE(review): not referenced anywhere in the visible code.
private static Integer COUNT = 0;
// REGION as a File; assigned in the static initializer.
private static final File file;
// REGION + FLAG marker file; assigned in the static initializer.
private static final File flagFile;
public static final Gson gson = new GsonBuilder().create();
// Extension appended to level names / province codes to form file names.
public static final String HTML = ".html";
// When true the street/village level is processed as well (only checked inside the town level).
public static final Boolean IS_GET_STREET = false;
// When true the town level is processed as well. NOTE(review): "TOEN" is presumably a typo for "TOWN".
public static final Boolean IS_GET_TOEN = false;
// CSS selectors / attribute names used when picking links and cells out of the tables.
public static final String A = "a";
public static final String TDA = "td a";
public static final String HREF = "href";
// SLF4J-style pattern; getElements fills both placeholders with the level name to build the row selector.
public static final String TEMPLATE = "table.{}table tbody tr.{}tr ";
public static final String TD = "td";
// Name of the marker file inside REGION.
public static final String FLAG = "region.txt";
// Counted down by write() once the download phase is over; main() awaits it before reading the files.
public static final CountDownLatch write = new CountDownLatch(1);
// Prepares the local download directory and its marker file when the class is loaded.
static {
    file = new File(REGION);
    flagFile = new File(REGION.concat(FLAG));
    try {
        if (!file.exists()) {
            file.mkdirs();
        }
        // The original re-tested `file`, which always exists at this point, so the marker
        // file was never created. write() treats "more than one entry" as "already crawled",
        // which only makes sense when the marker is the first entry.
        if (!flagFile.exists()) {
            flagFile.createNewFile();
        }
    } catch (IOException e) {
        log.error("Unable to prepare download directory {}", file.getAbsolutePath(), e);
    }
}
// Names the worker threads "thread-pool-N --- " so log lines can be attributed to a worker.
public static ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("thread-pool-%d --- ").build();
// 50 core / 200 max threads with a 30-slot queue; submissions beyond that are rejected (AbortPolicy).
// The queue class is java.util.concurrent.LinkedBlockingQueue (the lower-cased name does not exist).
public static ExecutorService threadPool = new ThreadPoolExecutor(50, 200, 0L, TimeUnit.SECONDS,
        new LinkedBlockingQueue<>(30), threadFactory, new AbortPolicy());
private static Connection getConnection(String u) {
Connection connection = Jsoup.connect(u).timeout(TIMEOUT);
connection.header(HttpConnection.CONTENT_ENCODING, Charsets.UTF_8.name());
connection.header("Accept", "*
/**
 * Downloads {@code url} and parses the response into a jsoup document.
 *
 * <p>The request is repeated until the status code is 200. When the shared {@code isRetry}
 * flag is set, the method sleeps two seconds before each request.
 *
 * @param url absolute page URL; may be null
 * @return the parsed page, or {@code null} when {@code url} is null or the request failed
 */
private static Document getdocument(String url) {
    if (url == null) {
        return null;
    }
    try {
        Response response;
        do {
            if (isRetry) {
                // Back off BEFORE re-requesting; the original slept only after a request had
                // already been made.
                Thread.sleep(2000);
            }
            response = getConnection(url).execute();
        } while (response.statusCode() != HttpStatus.SC_OK);
        // Parse the response we already hold. The original called connection.post() here,
        // which sent a second request (as POST) for every page.
        return response.parse();
    } catch (Exception e) {
        // Callers poll this method until it returns non-null, so the failure is reported
        // through the null return; the cause is kept at debug level.
        log.info("无法链接,正在重试~");
        log.debug("Request failed, url = {}", url, e);
    }
    return null;
}
/**
 * Downloads every page below one province cell and stores it locally: the city list, each
 * city's county list and, depending on {@code IS_GET_TOEN} / {@code IS_GET_STREET}, the town
 * and street lists.
 *
 * <p>Cells whose link text is empty are ignored and do not touch the barrier. For every other
 * cell the barrier is awaited exactly once, also when the crawl fails, so the remaining
 * workers are not left waiting forever.
 *
 * @param province      one cell of the province table
 * @param cyclicBarrier barrier shared by all province workers
 */
public static void writeToLocal(Element province, CyclicBarrier cyclicBarrier) {
    String proName = null;
    try {
        Elements proSelect = province.select(TDA);
        proName = proSelect.text();
        if (StringUtils.isNotEmpty(proName)) {
            String proHref = proSelect.attr(HREF);
            Document cityPage = fetchUntilLoaded(base_URL.concat(proHref));
            Elements cities = getCities(cityPage, proName, proHref, OperationType.WRITE);
            for (Element city : cities) {
                crawlCity(city, proName);
            }
            log.info("{} 所有区数据爬取完毕。", proName);
        }
    } catch (Exception exception) {
        log.error("Write error, province = {}", proName, exception);
    } finally {
        if (StringUtils.isNotEmpty(proName)) {
            awaitBarrier(cyclicBarrier, proName);
        }
    }
}

/** Stores the county page of one city row and, when enabled, descends into its counties. */
private static void crawlCity(Element city, String proName) throws IOException {
    Elements links = city.select(TDA);
    if (links.size() < 2) {
        // A row needs both the code link and the name link.
        return;
    }
    String cityName = links.get(1).text();
    String countyUrl = links.get(0).absUrl(HREF);
    String fileName = relativePath(countyUrl);
    Document countyPage = fetchUntilLoaded(countyUrl);
    Elements counties = getCounties(countyPage, proName, cityName, fileName, OperationType.WRITE);
    if (IS_GET_TOEN) {
        for (Element county : counties) {
            crawlCounty(county, proName, cityName);
        }
    }
}

/** Stores the town page of one county row and, when enabled, descends into its towns. */
private static void crawlCounty(Element county, String proName, String cityName) throws IOException {
    Elements links = county.select(TDA);
    if (links.size() < 2) {
        return;
    }
    String countyName = links.get(1).text();
    String townUrl = links.get(0).absUrl(HREF);
    String fileName = relativePath(townUrl);
    Document townPage = fetchUntilLoaded(townUrl);
    Elements towns = getTowns(townPage, proName, cityName, countyName, fileName, OperationType.WRITE);
    if (IS_GET_STREET) {
        for (Element town : towns) {
            crawlTown(town, proName, cityName, countyName);
        }
    }
}

/** Stores the street/village page of one town row. */
private static void crawlTown(Element town, String proName, String cityName, String countyName)
        throws IOException {
    Elements links = town.select(TDA);
    if (links.size() < 2) {
        return;
    }
    String townName = links.get(1).text();
    String streetUrl = links.get(0).absUrl(HREF);
    String fileName = relativePath(streetUrl);
    Document streetPage = fetchUntilLoaded(streetUrl);
    getStreets(streetPage, proName, cityName, countyName, townName, fileName, OperationType.WRITE);
}

/**
 * Polls getdocument until it yields a page, keeping the shared retry flag in sync.
 * NOTE(review): like the original loops, this never gives up on a permanently failing URL.
 */
private static Document fetchUntilLoaded(String url) {
    Document page;
    do {
        page = getdocument(url);
        isRetry = (page == null);
    } while (page == null);
    return page;
}

/** Strips base_URL from an absolute page URL (plain prefix removal, not a regex split). */
private static String relativePath(String absoluteUrl) {
    if (!absoluteUrl.startsWith(base_URL)) {
        throw new IllegalArgumentException("URL is not below " + base_URL + ": " + absoluteUrl);
    }
    return absoluteUrl.substring(base_URL.length());
}

/** Waits at the barrier and reports, rather than propagates, a broken or interrupted wait. */
private static void awaitBarrier(CyclicBarrier barrier, String proName) {
    try {
        barrier.await();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        log.error("Interrupted while waiting at the barrier, province = {}", proName, e);
    } catch (BrokenBarrierException e) {
        log.error("Barrier broken while waiting, province = {}", proName, e);
    }
}
/**
 * Selects the province cells of the index page; with {@code OperationType.WRITE} the page is
 * also stored as {@code <province level name>.html}.
 *
 * @param provincedocument parsed index page (jsoup {@code Document}); may be null
 * @param operationType    WRITE to store the page locally as well
 * @return what getElements returns for the PROVINCE level
 * @throws IOException if the page cannot be written
 */
public static Elements getProvinces(Document provincedocument, OperationType operationType) throws IOException {
    String fileName = AreaLevelEnum.PROVINCE.getLevel().concat(HTML);
    return getElements(provincedocument, AreaLevelEnum.PROVINCE, null, null, null, null, fileName, operationType);
}
/**
 * Selects the city rows of a province page; with {@code OperationType.WRITE} the page is also
 * stored under {@code fileName}.
 *
 * @param citydocument parsed province page (jsoup {@code Document}); may be null
 * @param proName      province name, used for logging
 * @param fileName     path of the stored page relative to the download directory
 * @throws IOException if the page cannot be written
 */
public static Elements getCities(Document citydocument, String proName, String fileName,
        OperationType operationType) throws IOException {
    return getElements(citydocument, AreaLevelEnum.CITY, proName, null, null, null, fileName, operationType);
}
/**
 * Selects the county rows of a city page; with {@code OperationType.WRITE} the page is also
 * stored under {@code fileName}.
 *
 * @param countydocument parsed city page (jsoup {@code Document}); may be null
 * @param proName        province name, used for logging
 * @param cityName       city name, used for logging
 * @param fileName       path of the stored page relative to the download directory
 * @throws IOException if the page cannot be written
 */
public static Elements getCounties(Document countydocument, String proName, String cityName, String fileName,
        OperationType operationType) throws IOException {
    return getElements(countydocument, AreaLevelEnum.COUNTY, proName, cityName, null, null, fileName,
            operationType);
}
/**
 * Selects the town rows of a county page; with {@code OperationType.WRITE} the page is also
 * stored under {@code fileName}.
 *
 * @param towndocument parsed county page (jsoup {@code Document}); may be null
 * @param proName      province name, used for logging
 * @param cityName     city name, used for logging
 * @param countyName   county name, used for logging
 * @param fileName     path of the stored page relative to the download directory
 * @throws IOException if the page cannot be written
 */
public static Elements getTowns(Document towndocument, String proName, String cityName,
        String countyName, String fileName, OperationType operationType) throws IOException {
    return getElements(towndocument, AreaLevelEnum.TOWN, proName, cityName, countyName, null, fileName,
            operationType);
}
/**
 * Selects the street/village cells of a town page; with {@code OperationType.WRITE} the page
 * is also stored under {@code fileName}.
 *
 * @param streetdocument parsed town page (jsoup {@code Document}); may be null
 * @param proName        province name, used for logging
 * @param cityName       city name, used for logging
 * @param countyName     county name, used for logging
 * @param townName       town name, used for logging
 * @param fileName       path of the stored page relative to the download directory
 * @throws IOException if the page cannot be written
 */
public static Elements getStreets(Document streetdocument, String proName, String cityName,
        String countyName, String townName, String fileName, OperationType operationType) throws IOException {
    return getElements(streetdocument, AreaLevelEnum.VILLAGE, proName, cityName, countyName, townName, fileName,
            operationType);
}
/**
 * Selects the table entries of {@code level} from {@code document} and, for
 * {@code OperationType.WRITE}, stores the page HTML under {@code REGION + fileName}.
 *
 * <p>For the PROVINCE and VILLAGE levels the individual {@code td} cells are selected; for all
 * other levels the rows are selected.
 *
 * @param document      parsed page; when null nothing is selected or written
 * @param level         level whose name fills both placeholders of {@code TEMPLATE}
 * @param proName       province name, only used to pick the log message
 * @param cityName      city name, only used to pick the log message
 * @param countyName    county name, only used to pick the log message
 * @param townName      town name, only used to pick the log message
 * @param fileName      target path relative to {@code REGION}; required for WRITE
 * @param operationType WRITE to store the page, anything else to select only
 * @return the selected elements, or {@code null} when {@code document} is null
 * @throws IOException if the page cannot be written
 */
private static Elements getElements(Document document, AreaLevelEnum level, String proName, String cityName,
        String countyName,
        String townName,
        String fileName,
        OperationType operationType) throws IOException {
    String le = level.getLevel();
    if (null == document) {
        return null;
    }
    String rowSelector = MessageFormatter.format(TEMPLATE, le, le).getMessage();
    boolean cellLevel = AreaLevelEnum.PROVINCE == level || AreaLevelEnum.VILLAGE == level;
    Elements elements = document.select(cellLevel ? rowSelector.concat(TD) : rowSelector);

    if (OperationType.WRITE == operationType) {
        // Named `target` so it no longer shadows the static `file` field. writeStringToFile
        // creates the file and any missing parent directories itself.
        File target = new File(REGION.concat(fileName));
        FileUtils.writeStringToFile(target, document.html(), "gb2312");
        String path = target.getAbsolutePath();
        log.info("<---------------------------------->");
        if (StringUtils.isNotEmpty(townName)) {
            log.info("{}-{}-{}-{} 所有街道/村 网页数据写入到本地完成~~,文件所在地址->{}", proName, cityName, countyName, townName, path);
        } else if (StringUtils.isNotEmpty(countyName)) {
            log.info("{}-{}-{} 所有镇 网页数据写入到本地完成~~,文件所在地址->{}", proName, cityName, countyName, path);
        } else if (StringUtils.isNotEmpty(cityName)) {
            log.info("{}-{} 所有区/县 网页数据写入到本地完成~~,文件所在地址->{}", proName, cityName, path);
        } else if (StringUtils.isNotEmpty(proName)) {
            log.info("{} 所有市 网页数据写入到本地完成~~,文件所在地址->{}", proName, path);
        } else {
            log.info("所有省 网页数据写入到本地完成~~,文件所在地址:{}", path);
        }
        log.info("<---------------------------------->");
    }
    // Failures propagate to the callers, which already log them; the former
    // printStackTrace + log + rethrow reported every error several times.
    return elements;
}
/** Pool task that downloads everything below one province cell via writeToLocal. */
private static class SyncWrite implements Runnable {
    private final Element provinceCell;
    private final CyclicBarrier barrier;

    /**
     * @param element       province cell handed to writeToLocal
     * @param cyclicBarrier barrier handed to writeToLocal
     */
    public SyncWrite(Element element, CyclicBarrier cyclicBarrier) {
        provinceCell = element;
        barrier = cyclicBarrier;
    }

    @Override
    public void run() {
        writeToLocal(provinceCell, barrier);
    }
}
/**
 * Download phase: fetches the province index page and submits one pool task per province.
 *
 * <p>If the download directory already holds more than one entry the crawl is skipped. In
 * every outcome the {@code write} latch is eventually counted down (immediately when nothing
 * is crawled or the set-up fails, otherwise by the barrier action once all province tasks
 * have arrived), so a caller awaiting the latch cannot block forever on a failed start.
 */
public static void write() {
    // list() returns null when the directory is missing or unreadable.
    String[] existing = file.list();
    if (existing != null && existing.length > 1) {
        log.info("所有省市区数据已经写入到本地完毕,若要更新所有省市区文件,请删除该文件夹即可,文件地址:{}", file.getAbsolutePath());
        write.countDown();
        return;
    }
    Document index = getdocument(AREA_URL);
    System.out.println("等待所有 省份数据 爬取 完毕 !!!!!!!");
    try {
        Elements provinces = getProvinces(index, OperationType.WRITE);
        // Only cells with link text reach the barrier in writeToLocal, so only those are
        // counted and submitted. The original used size() - 1, which is only right when the
        // table contains exactly one blank cell.
        List<Element> named = Lists.newArrayList();
        if (null != provinces) {
            for (Element cell : provinces) {
                if (StringUtils.isNotEmpty(cell.select(TDA).text())) {
                    named.add(cell);
                }
            }
        }
        if (named.isEmpty()) {
            log.error("No provinces found at {}; nothing was downloaded", AREA_URL);
            write.countDown();
            return;
        }
        CyclicBarrier cyclicBarrier = new CyclicBarrier(named.size(), () -> {
            log.info("===================== 所有省数据写入完毕 ========================");
            write.countDown();
        });
        for (Element province : named) {
            threadPool.submit(new SyncWrite(province, cyclicBarrier));
        }
    } catch (Exception e) {
        log.error("Unable to start the province crawl", e);
        // Release waiters instead of leaving them blocked; a second countDown is harmless.
        write.countDown();
    }
}
/**
 * Reads a stored page (written as gb2312) and parses it with jsoup.
 *
 * @param fileAddr path of the stored HTML file
 * @return the parsed page, or {@code null} when the file cannot be read or parsed
 */
private static Document readStringTodocument(String fileAddr) {
    try {
        return Jsoup.parse(FileUtils.readFileToString(new File(fileAddr), "gb2312"));
    } catch (IOException | RuntimeException e) {
        // The exception is passed as the trailing argument so SLF4J prints the stack trace;
        // the old "error = {}" placeholder was never filled.
        log.error("Read string to document error, file = {}", fileAddr, e);
    }
    return null;
}
private static class Area implements Runnable {
private Boolean isGetTown;
private Boolean isGetStreet;
private String url;
private Province province;
private CyclicBarrier cyclicBarrier;
private List provincesList;
public Area(Boolean isGetTown, Boolean isGetStreet, String url, Province province,CyclicBarrier cyclicBarrier
,List provincesList) {
this.isGetTown = isGetTown;
this.isGetStreet = isGetStreet;
this.url = url;
this.province = province;
this.cyclicBarrier = cyclicBarrier;
this.provincesList = provincesList;
}
@SneakyThrows
@Override
public void run() {
Province province = this.province;
String cityUrl = url.concat(province.getProvinceCode().concat(HTML));
document document = readStringTodocument(cityUrl);
String cityLevel = AreaLevelEnum.CITY.getLevel();
Elements cityElements = document
.select(MessageFormatter.format(TEMPLATE, cityLevel, cityLevel).getMessage());
List cities = Optional.ofNullable(cityElements).filter(a -> a.size() > 0).map(u -> {
return u.stream().map(k -> {
return Optional.ofNullable(k.select(TDA)).filter(a -> a.size() > 0).map(a -> {
Element codeElement = a.get(0);
Element nameElement = a.get(1);
City city = new City();
city.setCityCode(codeElement.text().substring(0,6));
city.setCityName(nameElement.text());
city.setLevel(AreaLevelEnum.CITY.getLevel());
String countyUrl = url.concat(codeElement.attr(HREF));
document countydocument = readStringTodocument(countyUrl);
String countLevel = AreaLevelEnum.COUNTY.getLevel();
Elements countyElements = countydocument
.select(MessageFormatter.format(TEMPLATE, countLevel, countLevel).getMessage());
List counties = Optional.ofNullable(countyElements).filter(c -> c.size() > 0).map(x -> {
return x.stream().map(c -> {
return Optional.ofNullable(c.select(TDA)).filter(aq -> aq.size() > 0).map(aq -> {
Element countyCode = aq.get(0);
Element countyName = aq.get(1);
County county = new County();
county.setCountyCode(countyCode.text().substring(0,6));
county.setCountyName(countyName.text());
county.setLevel(AreaLevelEnum.COUNTY.getLevel());
String townUrl = url.concat(countyCode.attr(HREF));
if (isGetTown) {
document towndocument = readStringTodocument(townUrl);
String townLevel = AreaLevelEnum.TOWN.getLevel();
Elements townElements = towndocument.select(
MessageFormatter.format(TEMPLATE, townLevel, townLevel).getMessage());
List towns = Optional.ofNullable(townElements).filter(w -> w.size() > 0)
.map(e -> {
return e.stream().map(r -> {
return Optional.ofNullable(e.select(TDA))
.filter(qq -> qq.size() > 0).map(qq -> {
Element townCode = qq.get(0);
Element townName = qq.get(1);
Town town = new Town();
town.setLevel(AreaLevelEnum.TOWN.getLevel());
town.setTownCode(townCode.text());
town.setTownName(townName.text());
if (isGetStreet) {
String streetUrl = url.concat(townCode.attr(HREF));
document streetdocument = readStringTodocument(streetUrl);
String streetLevel = AreaLevelEnum.VILLAGE.getLevel();
Elements streetElements = streetdocument.select(
MessageFormatter
.format(TEMPLATE, streetLevel, streetLevel)
.getMessage());
List streets = Optional
.ofNullable(streetElements)
.filter(t -> t.size() > 0).map(t -> {
return t.stream().map(v -> {
return Optional.ofNullable(v.select(TD))
.filter(we -> we.size() > 0).map(we -> {
Element streetCode = we.get(0);
Element streetTypeCode = we.get(1);
Element streetName = we.get(2);
Street street = new Street();
street.setLevel(
AreaLevelEnum.VILLAGE
.getLevel());
street.setStreetCode(
streetCode.text());
street.setStreetTypeCode(
streetTypeCode.text());
street.setStreetName(
streetName.text());
return street;
}).orElse(null);
}).filter(rs -> null != rs)
.collect(Collectors.toList());
}).orElse(null);
town.setStreets(streets);
}
return town;
}).orElse(null);
}).filter(r -> null != r).collect(Collectors.toList());
}).orElse(null);
county.setTowns(towns);
}
return county;
}).orElse(null);
}).filter(r -> null != r).collect(Collectors.toList());
}).orElse(null);
city.setCounties(counties);
return city;
}).orElse(null);
}).filter(r -> null != r).collect(Collectors.toList());
}).orElse(null);
province.setCities(cities);
System.out.println(province.getProvinceName() + ":" + gson.toJson(province));
provincesList.add(province);
cyclicBarrier.await();
}
}
/**
 * Entry point: runs the download phase, waits for it, then rebuilds every province from the
 * stored pages on the pool and prints the combined JSON sorted by province code.
 *
 * @param args unused
 */
public static void main(String[] args) {
    try {
        write();
        write.await();
        String proLevel = AreaLevelEnum.PROVINCE.getLevel();
        Document proPage = readStringTodocument(REGION.concat(File.separator).concat(proLevel).concat(HTML));
        Elements cells = getProvinces(proPage, OperationType.READ);

        // Build every province first so the barrier can be sized to the number of tasks that
        // are really submitted. The original used size() - 1 and dereferenced the selection
        // before null-checking it.
        List<Province> pending = Lists.newArrayList();
        if (null != cells) {
            for (Element cell : cells) {
                Elements a = cell.select(A);
                if (StringUtils.isNotEmpty(a.text())) {
                    Province province = new Province();
                    province.setLevel(proLevel);
                    province.setProvinceCode(a.attr(HREF).trim().substring(0, 2));
                    province.setProvinceName(a.text());
                    pending.add(province);
                }
            }
        }
        if (pending.isEmpty()) {
            log.error("No provinces found in the stored pages below {}", file.getAbsolutePath());
            threadPool.shutdown();
            return;
        }

        // Pool threads append concurrently, so the list must be synchronized.
        List<Province> provincesList = Collections.synchronizedList(Lists.newLinkedList());
        CyclicBarrier barrier = new CyclicBarrier(pending.size(), () -> {
            log.info("所有省份json数据组装完毕!!!!");
            provincesList.sort(Comparator.comparing(Province::getProvinceCode));
            System.out.println("<---------- 执行结果开始 --------->");
            System.out.println(gson.toJson(provincesList));
            System.out.println("<---------- 执行结果结束 --------->");
            threadPool.shutdown();
        });
        for (Province province : pending) {
            threadPool.execute(new Area(IS_GET_TOEN, IS_GET_STREET, REGION, province, barrier, provincesList));
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        log.error("Interrupted while waiting for the download phase", e);
    } catch (Exception e) {
        log.error("Occur error", e);
    }
}
}
最终结果:
项目和转出的json文件等审核完毕,我会发布到CSDN,勿催。
省市区Json文件地址:点击下载2020年国家省市区Json文件
爬虫项目地址:点击下载项目



