栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Java

乱码解析工具类优化到O(logN)水平

Java 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

乱码解析工具类优化到O(logN)水平

乱码解析工具类优化过程分析

版本一:

```java
import cn.hutool.core.collection.ConcurrentHashSet;

import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;


public class MessyCodeDetect {
    // Characters that are always treated as valid (never garbled). The static
    // initializer below fills the set with the ASCII range 0..127; the author's
    // rationale is that other encodings contain the ASCII table, so ASCII
    // characters are not expected to turn into garbage under another encoding.
    // The element type is declared explicitly: a raw Set gives up compile-time
    // type checking on every add/contains call.
    private static Set<Character> validSet = new ConcurrentHashSet<>();

    static {
        for (int i = 0; i <= 127; i++) {
            validSet.add((char) i);
        }
    }

    
    /**
     * Reports whether a character lies in one of the Unicode blocks this class
     * accepts as Chinese text or punctuation.
     *
     * @param c the character to classify
     * @return {@code true} if the block of {@code c} is CJK Unified Ideographs,
     *         CJK Compatibility Ideographs, CJK Unified Ideographs Extension A,
     *         General Punctuation, CJK Symbols and Punctuation, or Halfwidth and
     *         Fullwidth Forms; {@code false} otherwise
     */
    public static boolean isChinese(char c) {
        final Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
        final boolean ideograph = block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                || block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A;
        final boolean punctuationOrWidthForm = block == Character.UnicodeBlock.GENERAL_PUNCTUATION
                || block == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                || block == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        return ideograph || punctuationOrWidthForm;
    }

    
    public static boolean isMessyCode(String str){
        //空字符串默认不是乱码字符 直接返回
        if(str == null || str.length()==0) return false;
        char[] chars = str.toCharArray();
        //不是合法字符 也不是中文字符 那么就是乱码
        for(int i=0;i 

版本二:







```java
import cn.hutool.core.collection.ConcurrentHashSet;

import java.util.Set;
import java.util.concurrent.*;


/**
 * Counts garbled ("messy") characters in a string using a fork/join
 * divide-and-conquer task.
 *
 * <p>A character is counted as garbled when it is neither in the ASCII range
 * (0..127) nor in one of the Unicode blocks accepted by
 * {@link StatisticsMessyCode#isChinese(char)}.
 */
public class MessyCodeDetect2 {

    /** Highest character value treated as always valid (end of the ASCII range). */
    private static final char MAX_ASCII = 127;

    /**
     * Tells whether {@code c} is in the ASCII range 0..127. ASCII characters are
     * always treated as valid; the author's rationale is that other encodings
     * contain the ASCII table. A range check gives the same answer as a lookup
     * in a set pre-filled with 0..127, without boxing each character.
     */
    private static boolean isValidAscii(char c) {
        return c <= MAX_ASCII;
    }

    /**
     * Fork/join task that counts the garbled characters in
     * {@code content[start..end]} (both bounds inclusive).
     */
    static class StatisticsMessyCode extends RecursiveTask<Integer> {
        // Ranges with (end - start) <= THRESHOLD are scanned sequentially; larger
        // ranges are split in half. Every character is still examined exactly
        // once, so the total work stays O(N); only the depth of the split tree is
        // O(log N), which is what allows the halves to run in parallel.
        private static final int THRESHOLD = 10;
        private final char[] content;
        private final int start;
        private final int end;

        /**
         * @param content characters to scan; the array is not copied
         * @param start   index of the first character to scan (inclusive)
         * @param end     index of the last character to scan (inclusive);
         *                {@code start - 1} denotes an empty range
         */
        public StatisticsMessyCode(char[] content, int start, int end) {
            this.content = content;
            this.start = start;
            this.end = end;
        }

        /**
         * Reports whether a character lies in one of the Unicode blocks accepted
         * as Chinese text or punctuation.
         *
         * @param c the character to classify
         * @return {@code true} for CJK Unified Ideographs, CJK Compatibility
         *         Ideographs, CJK Unified Ideographs Extension A, General
         *         Punctuation, CJK Symbols and Punctuation, and Halfwidth and
         *         Fullwidth Forms; {@code false} otherwise
         */
        public static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        }

        /**
         * Counts the garbled characters in this task's range.
         *
         * @return the number of characters in {@code content[start..end]} that are
         *         neither ASCII nor accepted by {@link #isChinese(char)}
         */
        @Override
        protected Integer compute() {
            // Base case: the range is small enough to scan directly.
            if ((end - start) <= THRESHOLD) {
                int messCodeNum = 0;
                for (int i = start; i <= end; i++) {
                    char c = content[i];
                    // Neither a valid ASCII character nor Chinese: garbled.
                    if (!isValidAscii(c) && !isChinese(c)) {
                        messCodeNum++;
                    }
                }
                return messCodeNum;
            }

            // Divide: compute the midpoint as start + half the distance so the
            // addition cannot overflow the way (start + end) / 2 could.
            int mid = start + ((end - start) >>> 1);
            StatisticsMessyCode left = new StatisticsMessyCode(content, start, mid);
            StatisticsMessyCode right = new StatisticsMessyCode(content, mid + 1, end);

            // Fork only one half and compute the other in the current thread, so
            // this worker does useful work instead of only waiting on two joins.
            left.fork();
            int rightCount = right.compute();
            return left.join() + rightCount;
        }
    }

    /**
     * Demo entry point: counts the garbled characters of a sample string and
     * prints the character count, garbled count and non-garbled count.
     *
     * @param args unused
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while
     *                              waiting for the result
     */
    public static void main(String[] args) throws ExecutionException, InterruptedException {
        // NOTE(review): this sample looks like it lost its backslashes somewhere
        // ("u008A", trailing "n" before each closing quote) — confirm against the
        // author's original before relying on the printed counts.
        String str  = "ʡʈÂĹ【ƬqñǪau008Ao̜̊9ɌǪWEȶÇɅ]ý}ʂÃĹʼnau008Ao̜n" +
                "̊9]ʼnWEʔ【ǯuάɫˠ【ƬqñǪZɌǪȏĚʡʈ˫yϔǵάБŅɅЇÁn" +
                "ĥau008Ao̜̊9ĘͰ}WY9ÁĥWY9ɌÁĥWEˠŮǭcȇʞģʅ˾Yu009E¡n" +
                "¡【Ƭau008Ao̜̊9ɌǪ【ƬWEƐĘͰ}ĘͰ(t(n" +
                "ƲƼ̖úMɈβ˱Ɉͯάçn" +
                "au008Ao̜̊9ɌɵĘͰWEqżǽu008EĘͰ}ˠUПʔ【РϼZɌǪɪȌ[]ý}n" +
                "ʂÃĹʼnÁĥʔ?au008Ao̜̊9[Áĥʔ?WEY[]ʼn}ʂÃĹʼn]ʼn}ʂϱn" +
                "Ͱʔ?au008Ao̜̊9[}ʂϱͰʔ?WE";
        System.out.println("字符个数 :"+str.length());
        StatisticsMessyCode messyCodeCount = new StatisticsMessyCode(str.toCharArray(),0,str.length()-1);
        ForkJoinPool pool = new ForkJoinPool();
        try {
            ForkJoinTask<Integer> future = pool.submit(messyCodeCount);
            Integer messyCount = future.get();
            System.out.println("乱码个数 : "+ messyCount);
            System.out.println("非乱码个数 : "+ (str.length()-messyCount));
        } finally {
            // Release the pool's threads even when get() throws.
            pool.shutdown();
        }
    }

}

版本三:

 import cn.hutool.core.collection.ConcurrentHashSet;

import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.ForkJoinTask;
import java.util.concurrent.RecursiveTask;


/**
 * Utility that counts garbled ("messy") characters in a string with a
 * fork/join divide-and-conquer task and decides whether the garbled share
 * exceeds a caller-supplied rate.
 *
 * <p>A character is counted as garbled when it is neither in the ASCII range
 * (0..127) nor in one of the Unicode blocks accepted by
 * {@link StatisticsMessyCode#isChinese(char)}.
 */
@Slf4j
public class MessyCodeDetect3 {

    /** Highest character value treated as always valid (end of the ASCII range). */
    private static final char MAX_ASCII = 127;

    /**
     * Tells whether {@code c} is in the ASCII range 0..127. ASCII characters are
     * always treated as valid; the author's rationale is that other encodings
     * contain the ASCII table. A range check gives the same answer as a lookup
     * in a set pre-filled with 0..127, without boxing each character.
     */
    private static boolean isValidAscii(char c) {
        return c <= MAX_ASCII;
    }

    /**
     * Fork/join task that counts the garbled characters in
     * {@code content[start..end]} (both bounds inclusive).
     */
    static class StatisticsMessyCode extends RecursiveTask<Integer> {
        // Ranges with (end - start) <= THRESHOLD are scanned sequentially; larger
        // ranges are split in half. Every character is still examined exactly
        // once, so the total work stays O(N); only the depth of the split tree is
        // O(log N), which is what allows the halves to run in parallel.
        // The original author suggests tuning this value by load-testing on the
        // target system.
        private static final int THRESHOLD = 10;
        private final char[] content;
        private final int start;
        private final int end;

        /**
         * @param content characters to scan; the array is not copied
         * @param start   index of the first character to scan (inclusive)
         * @param end     index of the last character to scan (inclusive);
         *                {@code start - 1} denotes an empty range
         */
        public StatisticsMessyCode(char[] content, int start, int end) {
            this.content = content;
            this.start = start;
            this.end = end;
        }

        /**
         * Reports whether a character lies in one of the Unicode blocks accepted
         * as Chinese text or punctuation.
         *
         * @param c the character to classify
         * @return {@code true} for CJK Unified Ideographs, CJK Compatibility
         *         Ideographs, CJK Unified Ideographs Extension A, General
         *         Punctuation, CJK Symbols and Punctuation, and Halfwidth and
         *         Fullwidth Forms; {@code false} otherwise
         */
        public static boolean isChinese(char c) {
            Character.UnicodeBlock ub = Character.UnicodeBlock.of(c);
            return ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS
                    || ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A
                    || ub == Character.UnicodeBlock.GENERAL_PUNCTUATION
                    || ub == Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION
                    || ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS;
        }

        /**
         * Counts the garbled characters in this task's range.
         *
         * @return the number of characters in {@code content[start..end]} that are
         *         neither ASCII nor accepted by {@link #isChinese(char)}
         */
        @Override
        protected Integer compute() {
            // Base case: the range is small enough to scan directly.
            if ((end - start) <= THRESHOLD) {
                int messCodeNum = 0;
                for (int i = start; i <= end; i++) {
                    char c = content[i];
                    // Neither a valid ASCII character nor Chinese: garbled.
                    if (!isValidAscii(c) && !isChinese(c)) {
                        messCodeNum++;
                    }
                }
                return messCodeNum;
            }

            // Divide: compute the midpoint as start + half the distance so the
            // addition cannot overflow the way (start + end) / 2 could.
            int mid = start + ((end - start) >>> 1);
            StatisticsMessyCode left = new StatisticsMessyCode(content, start, mid);
            StatisticsMessyCode right = new StatisticsMessyCode(content, mid + 1, end);

            // Fork only one half and compute the other in the current thread, so
            // this worker does useful work instead of only waiting on two joins.
            left.fork();
            int rightCount = right.compute();
            return left.join() + rightCount;
        }
    }

    /**
     * Counts the garbled characters in {@code content}.
     *
     * @param content the text to inspect; {@code null} or empty yields 0
     * @return the number of characters that are neither ASCII nor accepted by
     *         {@link StatisticsMessyCode#isChinese(char)}
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while
     *                              waiting for the result
     */
    public static Integer getMessyCode(String content) throws ExecutionException, InterruptedException {
        if (content == null || content.isEmpty()) {
            return 0;
        }
        StatisticsMessyCode messyCodeCount =
                new StatisticsMessyCode(content.toCharArray(), 0, content.length() - 1);
        ForkJoinPool pool = new ForkJoinPool();
        try {
            ForkJoinTask<Integer> future = pool.submit(messyCodeCount);
            return future.get();
        } finally {
            // Release the pool's threads even when get() throws.
            pool.shutdown();
        }
    }

    /**
     * Decides whether the share of garbled characters in {@code content} is
     * strictly greater than {@code messyRate}.
     *
     * @param content   the text to inspect; {@code null} or empty is reported as
     *                  not garbled
     * @param messyRate the share of garbled characters (garbled count divided by
     *                  total length) that must be exceeded
     * @return {@code true} if garbled count / length is greater than
     *         {@code messyRate}
     * @throws ExecutionException   if the counting task fails
     * @throws InterruptedException if the calling thread is interrupted while
     *                              waiting for the result
     */
    public static  boolean isMessyCode(String content,float messyRate) throws ExecutionException, InterruptedException {
        // Nothing to inspect; this also avoids dividing by a zero length below.
        if (content == null || content.isEmpty()) {
            return false;
        }
        log.info("messy-code detection start");
        long curTime = System.currentTimeMillis();
        Integer messyCodeSum = getMessyCode(content);
        log.info("messy-code detection finish,it costs time {}",(System.currentTimeMillis()-curTime));
        // Divide in floating point: integer division would truncate the ratio to
        // 0 whenever fewer than all characters are garbled.
        return ((float) messyCodeSum / content.length()) > messyRate;
    }
}
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/865285.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号