GBK是简体中文版Windows默认的汉字编码方式(代码页936),即"国标扩展"。想在控制台或是控件上显示汉字,使用这种编码方式最方便。
UTF-8是一种变长字节编码方式,通过把字节数变长,可以容纳更多的信息。
UTF-8编码方式:
当码长为1字节的时候,兼容ascii编码,格式为0xxxxxxx (x处表示有效位)
当码长为2字节的时候,格式为110xxxxx 10xxxxxx
当码长为3字节的时候,格式为1110xxxx 10xxxxxx 10xxxxxx
当码长为4字节的时候,格式为11110xxx 10xxxxxx 10xxxxxx 10xxxxxx...
来自:https://blog.csdn.net/haohulala/article/details/86600936
· 最初设计的UTF8编码码长最大为6字节,1111110x 10...(现行标准RFC 3629已将合法编码限制为最多4字节)
· 如果一个字符的utf8编码为110aaaaa 10bbbbbb 则其对应的Unicode编码为aaaaabbbbbb,其他同理(常用汉字的utf8编码一般为3字节)
所以我们就有了对于单个字符的UTF-8到Unicode的转换过程
// Returns the length in bytes (1-6) of the UTF-8 sequence introduced by the
// lead byte sUTF8[0], or 0 when that byte cannot start a sequence.
// Only the first byte is inspected.
int GetUTF8Length(unsigned char *sUTF8){
    // Entry k describes a (k + 1)-byte sequence: `mask` selects the prefix
    // bits of the lead byte and `want` is the value those bits must have.
    static const unsigned char mask[6] = {0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE};
    static const unsigned char want[6] = {0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
    const unsigned char lead = sUTF8[0];

    for (int k = 0; k < 6; k++) {
        if ((lead & mask[k]) == want[k]) {
            return k + 1;
        }
    }
    // Continuation byte (10xxxxxx) or 0xFE / 0xFF.
    return 0;
}
// Decodes the single UTF-8 sequence that starts at sUTF8 and returns its
// code point.
//
// sUTF8  pointer to the lead byte of the sequence.
// Returns the decoded value. Returns 0 when the lead byte is not a valid
// sequence start (GetUTF8Length reports 0) or when one of the following bytes
// is not a continuation byte of the form 10xxxxxx. Decoding stops at the first
// such byte, so a sequence cut short by the string terminator is never read
// past that terminator.
unsigned int UTF8_To_Unicode(unsigned char *sUTF8){
    int len = GetUTF8Length(sUTF8);
    if (len <= 0) {
        // Without this return the function would fall off its end, which is
        // undefined behaviour for a value-returning function.
        return 0;
    }
    if (len == 1) {
        return sUTF8[0];
    }

    // The lead byte of an n-byte sequence has (n + 1) prefix bits, so its
    // payload mask is 0xFF shifted right by (n + 1): 0x1F, 0x0F, 0x07, ...
    unsigned int code = sUTF8[0] & (0xFFu >> (len + 1));
    for (int i = 1; i < len; i++) {
        if ((sUTF8[i] & 0xC0) != 0x80) {
            return 0;   // malformed or truncated sequence
        }
        // Each continuation byte contributes its low six bits.
        code = (code << 6) | (sUTF8[i] & 0x3Fu);
    }
    return code;
}
我暂且还没想出一种好的算法把这几大坨if用一个简短的表达式表示,欢迎大家讨论。
至于Unicode到GBK的转换,Windows API里有一些函数(例如wsprintf)可以帮忙
// Formats one Unicode character as a narrow string via wsprintf's "%wc"
// conversion (the original note on this function reads "Unicode 16 BE").
//
// cUni  code point to convert; it is cast to wchar_t before formatting, so
//       bits that do not fit in wchar_t are discarded.
// Returns a buffer of 6 chars allocated with new[]; the caller owns it and
// must release it with delete[].
// NOTE(review): the output encoding is presumably the process ANSI code page
// (GBK on Simplified Chinese Windows) - confirm against the wsprintf docs.
char* Unicode_To_GBK(unsigned int cUni){
    const wchar_t wide = (wchar_t)cUni;
    char* out = new char[6];
    wsprintf(out, "%wc", wide);
    return out;
}
下面是读取汉字源码
#include#include #include //#include #include using namespace std; //void binoutput(int n){ //二进制输出 // cout< (n)<<"n"; //} int GetUTF8Length(char *sUTF8){ unsigned char _s0=sUTF8[0]; if( _s0 >> 7 == 0b0 )return 1; if( _s0 >> 5 == 0b110 )return 2; if( _s0 >> 4 == 0b1110 )return 3; if( _s0 >> 3 == 0b11110 )return 4; if( _s0 >> 2 == 0b111110 )return 5; if( _s0 >> 1 == 0b1111110 )return 6; return 0; } unsigned int UTF8_To_Unicode(unsigned char *sUTF8){ int l=GetUTF8Length(reinterpret_cast (sUTF8)); if(l==1) return sUTF8[0]; if(l==2) return ((sUTF8[0]& 0b00011111)<<6) + (sUTF8[1]& 0b00111111);//0b110xxxxx if(l==3) return ((sUTF8[0]& 0b00001111)<<12) + ((sUTF8[1]& 0b00111111)<<6) + (sUTF8[2]& 0b00111111);//0b1110xxxx if(l==4) return ((sUTF8[0]& 0b00000111)<<18) + ((sUTF8[1]& 0b00111111)<<12) + ((sUTF8[2]& 0b00111111)<<6) + (sUTF8[3]& 0b00111111);//0b11110xxx if(l==5) return ((sUTF8[0]& 0b00000011)<<24) + ((sUTF8[1]& 0b00111111)<<18) + ((sUTF8[2]& 0b00111111)<<12) + ((sUTF8[3]& 0b00111111)<<6) + (sUTF8[4]& 0b00111111);//0b111110xx if(l==6) return ((sUTF8[0]& 0b00000001)<<30) + ((sUTF8[1]& 0b00111111)<<24) + ((sUTF8[2]& 0b00111111)<<18) + ((sUTF8[3]& 0b00111111)<<12) + ((sUTF8[4]& 0b00111111)<<6) + (sUTF8[5]& 0b00111111);//0b1111110x } char* Unicode_To_GBK(unsigned int cUni){//Unicode 16 BE char* sGBK=new char[6]; wsprintf(sGBK,"%wc",(wchar_t)cUni); return sGBK; } char* UTF8_To_GBK(char *sUTF8){ return Unicode_To_GBK(UTF8_To_Unicode(reinterpret_cast (sUTF8))); } int main(){ FILE *fp=fopen("hanzi.txt","r"); char str[100]={0},*pstr,*endstr; while(!feof(fp)){ fgets(str,100,fp); pstr=str; endstr=str+strlen(str); while(pstr < endstr){ printf("%s",UTF8_To_GBK(pstr)); pstr+= GetUTF8Length(pstr); } } fclose(fp); }
提一嘴reinterpret_cast,强制转换真是太爽了,当年被一堆cannot convert from type to const type 要折磨疯了。



