Goal: read from one HBase table, modify the data, and write it into a new table, 2_library_token (Ctrl+F "2_lib" to find it in the code below).
Each pass reads 10,001 rowkeys and spins up 500 threads, each thread inserting 20 rowkeys; every rowkey in turn holds some number of versioned cells (a single row can contain 200,000 cells).
I used 4 machines. HBase keeps rows sorted by rowkey, and my rowkeys are md5 hashes, so I split the keyspace into 16 big batches and gave each machine 4 of them. That is 2,000 threads in total, and the insert rate reached about 3 million writes per second; a hundred billion records took the 4 machines 4 days.
Each batch then uses the last rowkey it read as the start row of the next read, as the sketch below illustrates.
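To make the split concrete, here is a minimal sketch, mine rather than code from the project, of how the 16 hex prefixes of an md5 rowkey can be handed out, four per machine (the class name RangeSplit and the args[0] convention are assumptions):
public class RangeSplit {
    public static void main(String[] args) {
        char[] hex = "0123456789abcdef".toCharArray();
        int machine = Integer.parseInt(args[0]); // 0..3: which of the 4 machines this is
        for (int i = machine * 4; i < machine * 4 + 4; i++) {
            String startRow = String.valueOf(hex[i]);
            // the last batch has no stop row: scan to the end of the table
            String stopRow = (i == 15) ? "" : String.valueOf(hex[i + 1]);
            System.out.println("batch " + i + ": startRow=" + startRow + ", stopRow=" + stopRow);
        }
    }
}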
I threw in every dependency I had lying around, needed or not; trim as you see fit. The pom:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>insertApi</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <hadoop.version>2.6.5</hadoop.version>
        <hbase.version>1.7.0</hbase.version>
    </properties>

    <build>
        <finalName>insertHbaseKu-v1.0</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <dependencies>
        <!-- these are the ones the code below actually needs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-core</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
        <!-- leftovers from other projects, safe to delete:
             dom4j:dom4j:1.6.1, mysql:mysql-connector-java:5.1.24,
             org.mongodb:mongo-java-driver:3.4.2,
             org.apache.httpcomponents:httpclient:4.5.3,
             net.sf.json-lib:json-lib:2.4 (classifier jdk15),
             commons-codec:commons-codec:1.10,
             org.apache.commons:commons-dbcp2:2.1.1,
             org.apache.commons:commons-pool2:2.4.1,
             log4j:log4j:1.2.17, org.tukaani:xz:1.5,
             net.sf.sevenzipjbinding:sevenzipjbinding:9.20-2.00beta,
             commons-io:commons-io:2.4,
             org.apache.maven:maven-artifact:3.6.3,
             org.scala-lang:scala-library:2.11.8,
             org.apache.ant:ant:1.10.5, cn.hutool:hutool-all:5.1.1,
             com.github.zafarkhaja:java-semver:0.9.0,
             org.slf4j:slf4j-api:1.7.30,
             org.junit.jupiter:junit-jupiter-api:5.7.0-M1,
             com.moandjiezana.toml:toml4j:0.7.2,
             org.yaml:snakeyaml:1.26 -->
    </dependencies>
</project>
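With the finalName above, mvn package should produce target/insertHbaseKu-v1.0.jar, ready to copy to each of the four machines.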
Directory structure (screenshot omitted): three classes, Worker, LinuxKernelKu, and DeleteRow.
Worker:
import com.alibaba.fastjson.JSONObject;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.concurrent.CountDownLatch;
public class Worker implements Runnable
{
private CountDownLatch downLatch;
private Integer name;
private Connection hbaseConnection;
private ArrayList<String> rows;// the rowkeys this worker is responsible for
public Worker(CountDownLatch downLatch, Integer name, Connection hbaseConnection, ArrayList<String> rows) {
this.downLatch = downLatch;
this.name = name;
this.hbaseConnection = hbaseConnection;
this.rows = rows;
}
@Override
public void run()
{
threadInsert(hbaseConnection,rows);
// System.out.println(this.name + " has finished its work!");
this.downLatch.countDown();
long remaining = downLatch.getCount();
if(remaining==400 || remaining==300 || remaining==200 || remaining==100){
System.out.println("There are still "+remaining+" workers that have not finished");
}
}
public static void threadInsert(Connection hbaseConnection,ArrayList<String> rowKeys){
try {
Table tableR = hbaseConnection.getTable(TableName.valueOf("0_library_token"));
Table tableW = hbaseConnection.getTable(TableName.valueOf("2_library_token"));
for(String row:rowKeys){
Get get = new Get(Bytes.toBytes(row)).setMaxVersions(1111111)// a number large enough to mean "all versions"
.addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"));
Cell[] cells = tableR.get(get).rawCells();
for(Cell cell:cells){
String jsonStr = Bytes.toString(CellUtil.cloneValue(cell));
JSONObject jsonObject = JSONObject.parseObject(jsonStr);
String[] hbasePathStrs = jsonObject.getString("path").split("/");
String projectVersion = hbasePathStrs[1] + "/" + hbasePathStrs[3];// the new cell value
Put put = new Put(Bytes.toBytes(row)); //specify the rowkey
put.addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"), Bytes.toBytes(projectVersion));
tableW.put(put);
}
}
tableR.close();
tableW.close();
} catch (IOException e) {
e.printStackTrace();// don't swallow errors silently; failed rows need to be re-run
}
}
}
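One thing worth flagging in threadInsert: it issues one Put per cell against the same F:F column without an explicit timestamp, so two cells written within the same millisecond get the same server timestamp and one silently overwrites the other. A variant I would suggest, not the original code, batches all cells of a row into a single Put and carries over each source cell's timestamp, which preserves the version history and cuts the RPCs to one per row (putRowBatched is a hypothetical helper using Worker's imports):
public static void putRowBatched(Table tableW, String row, Cell[] cells) throws IOException {
    Put put = new Put(Bytes.toBytes(row));
    for (Cell cell : cells) {
        JSONObject jsonObject = JSONObject.parseObject(Bytes.toString(CellUtil.cloneValue(cell)));
        String[] hbasePathStrs = jsonObject.getString("path").split("/");
        String projectVersion = hbasePathStrs[1] + "/" + hbasePathStrs[3];
        // reuse the source cell's timestamp so every version survives the copy
        put.addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"),
                cell.getTimestamp(), Bytes.toBytes(projectVersion));
    }
    tableW.put(put); // one RPC per row instead of one per cell
}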
LinuxKernelKu (the main class):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
public class LinuxKernelKu {
static Connection hbaseConnection;
static String rowStart="0";
public static void main(String[] args) {
//initialize HBase and create a connection pool with 600 threads
//HBase data access
Configuration Hbase_CONF;
Hbase_CONF = HBaseConfiguration.create();
Hbase_CONF.set("hbase.zookeeper.property.clientPort", "2181");
Hbase_CONF.set("hbase.zookeeper.quorum", "192.168.xx.xx");
Hbase_CONF.set("hbase.master", "192.168.xx.xx:60000");
Hbase_CONF.set("zookeeper.znode.parent", "/hbase");
Hbase_CONF.setInt("hbase.hconnection.threads.max", 600);
Hbase_CONF.setInt("hbase.hconnection.threads.core", 600);
Hbase_CONF.setLong("hbase.hconnection.threads.keepalivetime", 1000);
try {
hbaseConnection = ConnectionFactory.createConnection(Hbase_CONF);//initialize the connection pool
//set the starting row; it advances after every batch
while (true) {
boolean b = cpNewLibaryToken();
if(!b){
System.out.println("Error: delete the 10,000 rows after this rowStart; the row is "+rowStart);
break;
}
if(rowStart.startsWith("c")){
System.out.println("All done; the final rowStart is "+rowStart);
break;
}
}
} catch (IOException e) {
e.printStackTrace();
}
}
// the multithreaded copy method
// writes into the new HBase table (the small-file library)
// uses the global rowStart and the HBase connection
//TODO later versions of this program live in the insertApi folder on my E drive
public static boolean cpNewLibaryToken() {
Instant inst1 = Instant.now();
ExecutorService executor = Executors.newCachedThreadPool();
// the CountDownLatch is created further down, once the number of worker groups is known
// endRow default: the first character of the first rowkey of your scan
String endRow="f";
System.out.println("connecting to HBase");
System.out.println("reading the 10,000 rowkeys after "+rowStart);
Scan scan = new Scan().withStartRow(rowStart.getBytes())
.setMaxVersions(99999999)
.addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"))
.setLimit(10001);
Table tableRead = null;
try {
tableRead = hbaseConnection.getTable(TableName.valueOf("0_library_token"));
ResultScanner scanner = tableRead.getScanner(scan);
ArrayList<String> rows = new ArrayList<>();
//TODO collect the rowkeys
for(Result rs:scanner){
String row = Bytes.toString(rs.getRow());
rows.add(row);
}
endRow = rows.get(rows.size()-1);
rows.remove(rows.size()-1);
System.out.println("rows now has "+rows.size()+" entries");
// the last rowkey is the startRow of the next batch, so it is excluded here and not processed
System.out.println("the last row is "+endRow);
//10,000 rowkeys, 500 threads, 20 rowkeys per thread
//TODO split the rowkeys into groups of 20; 500 groups map to 500 threads
// the map key is the worker's name: 1, 2, 3, ...
HashMap<Integer, ArrayList<String>> threadMap = new HashMap<>();
int mapKey=1;// thread key
int i=1;// running count of rows
ArrayList<String> rows1 = new ArrayList<>();// the group for a single thread
for(String row:rows){
rows1.add(row);
if(i%20==0){// groups of 20
threadMap.put(mapKey,rows1);
rows1=new ArrayList<>();
mapKey++;
}
i++;
}
if(!rows1.isEmpty()){// don't drop a final group of fewer than 20 rows
threadMap.put(mapKey,rows1);
}
//TODO dispatch the groups to worker threads
// size the latch to the number of workers actually created (normally 500);
// a fixed 500 would make await() hang forever if the final batch has fewer groups
CountDownLatch dLatch = new CountDownLatch(threadMap.size());
for(int workerName:threadMap.keySet()){// which worker gets which group
Worker worker = new Worker(dLatch,workerName,hbaseConnection,threadMap.get(workerName));
executor.execute(worker);
}
dLatch.await();// wait for all worker threads to finish before continuing
executor.shutdown();
rowStart=endRow;
Instant inst2 = Instant.now();
System.out.println("******以秒计的时间差:" + Duration.between(inst1, inst2).getSeconds());
return true;
} catch (Exception e) {
e.printStackTrace();// keep the stack trace: you will need it to decide what to clean up
System.out.println("The failing rowKey is "+rowStart);
return false;
}
}
}
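If 500 threads of plain Table.put are not enough to hit the quoted write rate, HBase's BufferedMutator batches mutations client-side and flushes them in bulk. A minimal sketch under the same imports as the main class (writeBuffered and the 8 MB buffer size are my assumptions, not part of the original program):
public static void writeBuffered(Connection conn, ArrayList<Put> puts) throws IOException {
    BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf("2_library_token"))
            .writeBufferSize(8 * 1024 * 1024); // 8 MB client-side buffer, tune as needed
    try (BufferedMutator mutator = conn.getBufferedMutator(params)) {
        for (Put put : puts) {
            mutator.mutate(put); // queued locally, sent to the region servers in batches
        }
    } // close() flushes whatever is still buffered
}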
If an error occurs midway, take the printed rowKey, scan from it, and delete the corresponding 10,000 rows from the new table.
The deletion utility, DeleteRow:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
public class DeleteRow {
static Connection hbaseConnection;
static String rowStart="269eba6e48e4c05afd370b59dbd94ddc";//你要扫描的rowkey(这个可以根据上面的insert插入方法的打印信息截取)
//你如果是需要截取中间的,则加一个rowEnd,我这里没有加
public static void main(String[] args) {
Configuration Hbase_CONF;
Hbase_CONF = HBaseConfiguration.create();
Hbase_CONF.set("hbase.zookeeper.property.clientPort", "2181");
Hbase_CONF.set("hbase.zookeeper.quorum", "192.168.31.71");
Hbase_CONF.set("hbase.master", "192.168.31.71:60000");
Hbase_CONF.set("zookeeper.znode.parent", "/hbase");
Hbase_CONF.setInt("hbase.hconnection.threads.max", 600);
Hbase_CONF.setInt("hbase.hconnection.threads.core", 600);
Hbase_CONF.setLong("hbase.hconnection.threads.keepalivetime", 1000);
try {
hbaseConnection = ConnectionFactory.createConnection(Hbase_CONF);//initialize the connection pool
//set the starting row
System.out.println("connecting to HBase");
System.out.println("reading the 10,000 rowkeys after "+rowStart);
//now scan from this row and delete every row found
Scan scan = new Scan().withStartRow(rowStart.getBytes())
.setMaxVersions(99999999)
.addColumn(Bytes.toBytes("F"), Bytes.toBytes("F"))
.setLimit(10001);
Table tableRead = hbaseConnection.getTable(TableName.valueOf("2_library_token"));
ResultScanner scanner = tableRead.getScanner(scan);
HashSet<String> rowKeySet = new HashSet<>();
for(Result rs:scanner){
String row = Bytes.toString(rs.getRow());
if(!row.isEmpty()){
rowKeySet.add(row);
}
}
System.out.println(rowKeySet.size());
List<Delete> deletes = new ArrayList<>();
for(String rowkey:rowKeySet){
Delete delete = new Delete(Bytes.toBytes(rowkey));
deletes.add(delete);
}
tableRead.delete(deletes);
tableRead.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
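One note on semantics: new Delete(rowkey) with no column specified removes the entire row, every column family and every version, which is exactly what re-running a half-finished batch needs.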