版本一:
```java
import cn.hutool.core.collection.ConcurrentHashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
public class MessyCodeDetect {
// Set of characters that are always treated as valid: the whole ASCII table
// (code points 0-127). Other encodings are ASCII-compatible, so ASCII text
// is not expected to turn into mojibake when decoded with a different charset.
private static Set<Character> validSet = new ConcurrentHashSet<>();

static {
    // Populate the set once at class-load time with every ASCII character.
    for (char c = 0; c <= 127; c++) {
        validSet.add(c);
    }
}
/**
 * Tells whether a character belongs to one of the Unicode blocks this detector
 * accepts as Chinese text: CJK ideographs (unified, extension A, compatibility),
 * CJK symbols and punctuation, general punctuation, and half-/full-width forms.
 *
 * @param c the character to classify
 * @return {@code true} if {@code c} lies in one of the accepted blocks
 */
public static boolean isChinese(char c) {
    final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
    // Identity comparison is enough: UnicodeBlock constants are singletons,
    // and a null block (unassigned code point) simply matches nothing.
    return block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
            || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
            || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
            || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
            || block == Character.UnicodeBlock.GENERAL_PUNCTUATION
            || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
}
public static boolean isMessyCode(String str){
//空字符串默认不是乱码字符 直接返回
if(str == null || str.length()==0) return false;
char[] chars = str.toCharArray();
//不是合法字符 也不是中文字符 那么就是乱码
for(int i=0;i
```
版本二:
```java
import cn.hutool.core.collection.ConcurrentHashSet;
import java.util.Set;
import java.util.concurrent.*;
/**
 * Demo that counts "messy" (mojibake) characters in a string using the fork/join framework.
 *
 * <p>A character is treated as valid when it is ASCII (code points 0-127) or when
 * {@link StatisticsMessyCode#isChinese(char)} accepts it; every other character is counted as
 * messy. ASCII is always accepted because other encodings are ASCII-compatible, so ASCII text is
 * not expected to be garbled by a charset mismatch.
 */
public class MessyCodeDetect2 {

    /** Highest ASCII code point; characters up to and including it are always valid. */
    private static final char MAX_ASCII = 127;

    /**
     * Returns whether a single character counts as messy: it is neither ASCII nor accepted by
     * {@link StatisticsMessyCode#isChinese(char)}.
     */
    private static boolean isMessy(char c) {
        return c > MAX_ASCII && !StatisticsMessyCode.isChinese(c);
    }

    /** Fork/join task that counts the messy characters in {@code content[start..end]}. */
    static class StatisticsMessyCode extends RecursiveTask<Integer> {

        /**
         * Ranges with {@code end - start <= THRESHOLD} are scanned sequentially; larger ranges are
         * split in half. Splitting only spreads the linear scan over worker threads, it does not
         * reduce the total amount of work, so tune this value by benchmarking.
         */
        private static final int THRESHOLD = 10;

        private final char[] content;
        private final int start;
        private final int end;

        /**
         * @param content characters to inspect (not copied)
         * @param start   index of the first character to inspect, inclusive
         * @param end     index of the last character to inspect, inclusive; pass
         *                {@code start - 1} for an empty range
         */
        public StatisticsMessyCode(char[] content, int start, int end) {
            this.content = content;
            this.start = start;
            this.end = end;
        }

        /**
         * Tells whether a character belongs to one of the Unicode blocks accepted as Chinese
         * text: CJK ideographs (unified, extension A, compatibility), CJK symbols and
         * punctuation, general punctuation, and half-/full-width forms.
         *
         * @param c the character to classify
         * @return {@code true} if {@code c} lies in one of the accepted blocks
         */
        public static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        }

        /**
         * Counts the messy characters in this task's range.
         *
         * @return number of characters in {@code content[start..end]} that are messy
         */
        @Override
        protected Integer compute() {
            // Base case: the range is small enough to scan directly.
            if ((end - start) <= THRESHOLD) {
                int messCodeNum = 0;
                for (int i = start; i <= end; i++) {
                    if (isMessy(content[i])) {
                        messCodeNum++;
                    }
                }
                return messCodeNum;
            }

            // Divide: compute the midpoint without risking overflow of (start + end).
            int mid = start + ((end - start) >>> 1);
            StatisticsMessyCode left = new StatisticsMessyCode(content, start, mid);
            StatisticsMessyCode right = new StatisticsMessyCode(content, mid + 1, end);

            // Fork only one half and process the other in the current worker, so this thread
            // does useful work instead of merely waiting on two forked children.
            left.fork();
            int rightCount = right.compute();
            return left.join() + rightCount;
        }
    }

    /**
     * Runs the counter on a sample garbled string and prints the total, messy and non-messy
     * character counts.
     */
    public static void main(String[] args) throws ExecutionException, InterruptedException {
        String str = "ʡʈÂĹ【ƬqñǪau008Ao̜̊9ɌǪWEȶÇɅ]ý}ʂÃĹʼnau008Ao̜n" +
                "̊9]ʼnWEʔ【ǯuάɫˠ【ƬqñǪZɌǪȏĚʡʈ˫yϔǵάБŅɅЇÁn" +
                "ĥau008Ao̜̊9ĘͰ}WY9ÁĥWY9ɌÁĥWEˠŮǭcȇʞģʅ˾Yu009E¡n" +
                "¡【Ƭau008Ao̜̊9ɌǪ【ƬWEƐĘͰ}ĘͰ(t(n" +
                "ƲƼ̖úMɈβ˱Ɉͯάçn" +
                "au008Ao̜̊9ɌɵĘͰWEqżǽu008EĘͰ}ˠUПʔ【РϼZɌǪɪȌ[]ý}n" +
                "ʂÃĹʼnÁĥʔ?au008Ao̜̊9[Áĥʔ?WEY[]ʼn}ʂÃĹʼn]ʼn}ʂϱn" +
                "Ͱʔ?au008Ao̜̊9[}ʂϱͰʔ?WE";
        System.out.println("字符个数 :" + str.length());
        StatisticsMessyCode messyCodeCount =
                new StatisticsMessyCode(str.toCharArray(), 0, str.length() - 1);
        ForkJoinPool pool = new ForkJoinPool();
        try {
            ForkJoinTask<Integer> future = pool.submit(messyCodeCount);
            Integer aLong = future.get();
            System.out.println("乱码个数 : " + aLong);
            System.out.println("非乱码个数 : " + (str.length() - aLong));
        } finally {
            // Release the worker threads even when the task fails.
            pool.shutdown();
        }
    }
}
```
版本三:
```java
import cn.hutool.core.collection.ConcurrentHashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;
/**
 * Detects whether a string is mostly "messy" (mojibake) by counting, with the fork/join
 * framework, the characters that are neither ASCII nor accepted as Chinese text.
 *
 * <p>ASCII (code points 0-127) is always accepted because other encodings are ASCII-compatible,
 * so ASCII text is not expected to be garbled by a charset mismatch.
 */
@Slf4j
public class MessyCodeDetect3 {

    /** Highest ASCII code point; characters up to and including it are always valid. */
    private static final char MAX_ASCII = 127;

    /**
     * Returns whether a single character counts as messy: it is neither ASCII nor accepted by
     * {@link StatisticsMessyCode#isChinese(char)}.
     */
    private static boolean isMessy(char c) {
        return c > MAX_ASCII && !StatisticsMessyCode.isChinese(c);
    }

    /** Fork/join task that counts the messy characters in {@code content[start..end]}. */
    static class StatisticsMessyCode extends RecursiveTask<Integer> {

        /**
         * Ranges with {@code end - start <= THRESHOLD} are scanned sequentially; larger ranges are
         * split in half. Splitting only spreads the linear scan over worker threads, it does not
         * reduce the total amount of work, so tune this value by load-testing on the target
         * system.
         */
        private static final int THRESHOLD = 10;

        private final char[] content;
        private final int start;
        private final int end;

        /**
         * @param content characters to inspect (not copied)
         * @param start   index of the first character to inspect, inclusive
         * @param end     index of the last character to inspect, inclusive; pass
         *                {@code start - 1} for an empty range
         */
        public StatisticsMessyCode(char[] content, int start, int end) {
            this.content = content;
            this.start = start;
            this.end = end;
        }

        /**
         * Tells whether a character belongs to one of the Unicode blocks accepted as Chinese
         * text: CJK ideographs (unified, extension A, compatibility), CJK symbols and
         * punctuation, general punctuation, and half-/full-width forms.
         *
         * @param c the character to classify
         * @return {@code true} if {@code c} lies in one of the accepted blocks
         */
        public static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        }

        /**
         * Counts the messy characters in this task's range.
         *
         * @return number of characters in {@code content[start..end]} that are messy
         */
        @Override
        protected Integer compute() {
            // Base case: the range is small enough to scan directly.
            if ((end - start) <= THRESHOLD) {
                int messCodeNum = 0;
                for (int i = start; i <= end; i++) {
                    if (isMessy(content[i])) {
                        messCodeNum++;
                    }
                }
                return messCodeNum;
            }

            // Divide: compute the midpoint without risking overflow of (start + end).
            int mid = start + ((end - start) >>> 1);
            StatisticsMessyCode left = new StatisticsMessyCode(content, start, mid);
            StatisticsMessyCode right = new StatisticsMessyCode(content, mid + 1, end);

            // Fork only one half and process the other in the current worker, so this thread
            // does useful work instead of merely waiting on two forked children.
            left.fork();
            int rightCount = right.compute();
            return left.join() + rightCount;
        }
    }

    /**
     * Counts the messy characters in {@code content}.
     *
     * @param content text to inspect; {@code null} or empty yields 0
     * @return number of characters that are neither ASCII nor accepted as Chinese text
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static Integer getMessyCode(String content) throws ExecutionException, InterruptedException {
        if (content == null || content.isEmpty()) {
            return 0;
        }
        StatisticsMessyCode messyCodeCount =
                new StatisticsMessyCode(content.toCharArray(), 0, content.length() - 1);
        ForkJoinPool pool = new ForkJoinPool();
        try {
            ForkJoinTask<Integer> future = pool.submit(messyCodeCount);
            return future.get();
        } finally {
            // Release the worker threads even when the task fails or the wait is interrupted.
            pool.shutdown();
        }
    }

    /**
     * Decides whether {@code content} is messy: the fraction of messy characters must be
     * strictly greater than {@code messyRate}.
     *
     * @param content   text to inspect; {@code null} or empty is never considered messy
     * @param messyRate threshold ratio, compared against messyCount / length
     * @return {@code true} if the messy-character ratio exceeds {@code messyRate}
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public static boolean isMessyCode(String content, float messyRate) throws ExecutionException, InterruptedException {
        // An empty text has no ratio to compute (it would divide by zero).
        if (content == null || content.isEmpty()) {
            return false;
        }
        log.info("messy-code detection start");
        long curTime = System.currentTimeMillis();
        int messyCodeSum = getMessyCode(content);
        log.info("messy-code detection finish,it costs time {}", (System.currentTimeMillis() - curTime));
        // Divide in floating point: integer division would truncate the ratio to 0 or 1.
        return ((float) messyCodeSum / content.length()) > messyRate;
    }
}



