栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > C/C++/C# > C#教程

c#检测文本文件编码的方法

C#教程 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

c#检测文本文件编码的方法

C#如何检测文本文件的编码?本文为大家分享了示例代码,具体内容如下:

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
 
namespace KlerksSoft
{
  public static class TextFileEncodingDetector
  {
    
 
    /// <summary>
    /// Default number of bytes (0x10000 = 65536) sampled for heuristic detection; this is the
    /// sample size the filename-based DetectTextFileEncoding overload passes to the stream-based one.
    /// </summary>
    const long _defaultHeuristicSampleSize = 0x10000; //completely arbitrary - inappropriate for high numbers of files / high speed requirements
 
    /// <summary>
    /// Opens the named file for reading and detects its text encoding using the default
    /// heuristic sample size.
    /// </summary>
    /// <param name="InputFilename">Path of the file to inspect; passed to File.OpenRead.</param>
    /// <param name="DefaultEncoding">Encoding handed to the stream-based overload as its fallback.</param>
    /// <returns>Whatever the stream-based DetectTextFileEncoding overload returns.</returns>
    public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)
    {
      // The stream is only needed for the duration of the detection, so it is disposed on exit.
      using (FileStream source = File.OpenRead(InputFilename))
      {
        Encoding detected = DetectTextFileEncoding(source, DefaultEncoding, _defaultHeuristicSampleSize);
        return detected;
      }
    }
 
    /// <summary>
    /// Detects the text encoding of an open file stream: first by looking for a byte order mark
    /// in the first (up to) four bytes, then by running heuristics over a sample taken from the
    /// start of the file.
    /// </summary>
    /// <param name="InputFileStream">Readable, seekable stream to inspect. Its position is
    /// restored to the value it had on entry, even if detection throws.</param>
    /// <param name="DefaultEncoding">Returned when neither BOM nor heuristics identify an encoding.</param>
    /// <param name="HeuristicSampleSize">Maximum number of bytes to sample for the heuristics.
    /// Values smaller than the BOM bytes already read are raised to that count.</param>
    /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="InputFileStream"/> is null.</exception>
    /// <exception cref="ArgumentException">The stream cannot be read or cannot seek.</exception>
    public static Encoding DetectTextFileEncoding(FileStream InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)
    {
      // ArgumentNullException takes (paramName, message) - the reverse of ArgumentException.
      if (InputFileStream == null)
        throw new ArgumentNullException("InputFileStream", "Must provide a valid Filestream!");

      if (!InputFileStream.CanRead)
        throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");

      if (!InputFileStream.CanSeek)
        throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");

      long originalPos = InputFileStream.Position;

      try
      {
        InputFileStream.Position = 0;
        long fileLength = InputFileStream.Length;

        //First read only what we need for BOM detection
        byte[] bomBytes = new byte[Math.Min(fileLength, 4L)];
        int bomBytesRead = ReadUntilFullOrEnd(InputFileStream, bomBytes, 0, bomBytes.Length);
        if (bomBytesRead < bomBytes.Length)
          Array.Resize(ref bomBytes, bomBytesRead); // stream ended earlier than Length promised

        Encoding encodingFound = DetectBOMBytes(bomBytes);
        if (encodingFound != null)
          return encodingFound;

        //BOM Detection failed, going for heuristics now.
        // The sample always contains at least the bytes already read for BOM detection, so a
        // tiny or negative HeuristicSampleSize can no longer make the copy below fail.
        long sampleSize = Math.Min(HeuristicSampleSize, fileLength);
        if (sampleSize < bomBytes.Length)
          sampleSize = bomBytes.Length;

        byte[] sampleBytes = new byte[sampleSize];
        Array.Copy(bomBytes, sampleBytes, bomBytes.Length);

        int sampleBytesFilled = bomBytes.Length
          + ReadUntilFullOrEnd(InputFileStream, sampleBytes, bomBytes.Length, sampleBytes.Length - bomBytes.Length);
        if (sampleBytesFilled < sampleBytes.Length)
          Array.Resize(ref sampleBytes, sampleBytesFilled); // do not feed unread zero padding to the heuristics

        //test byte array content
        encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);

        return encodingFound ?? DefaultEncoding;
      }
      finally
      {
        // Leave the caller's stream where we found it, whatever happened above.
        InputFileStream.Position = originalPos;
      }
    }

    // Reads into buffer until count bytes have arrived or the stream ends; returns the number of
    // bytes actually read (a single Stream.Read call may legitimately return fewer than requested).
    private static int ReadUntilFullOrEnd(Stream stream, byte[] buffer, int offset, int count)
    {
      int totalRead = 0;

      while (totalRead < count)
      {
        int justRead = stream.Read(buffer, offset + totalRead, count - totalRead);
        if (justRead <= 0)
          break;

        totalRead += justRead;
      }

      return totalRead;
    }
 
    /// <summary>
    /// Detects the text encoding of an in-memory byte array: first by byte order mark, then by
    /// heuristics over the whole array.
    /// </summary>
    /// <param name="TextData">Bytes to inspect.</param>
    /// <param name="DefaultEncoding">Returned when neither BOM nor heuristics identify an encoding.</param>
    /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="TextData"/> is null.</exception>
    public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)
    {
      // ArgumentNullException takes (paramName, message), in that order.
      if (TextData == null)
        throw new ArgumentNullException("TextData", "Must provide a valid text data byte array!");

      Encoding encodingFound = DetectBOMBytes(TextData);

      //no BOM: test byte array content
      if (encodingFound == null)
        encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);

      return encodingFound ?? DefaultEncoding;
    }
 
    /// <summary>
    /// Identifies an encoding from the byte order mark at the start of a byte array.
    /// </summary>
    /// <param name="BOMBytes">The leading bytes of the data; only the first four are examined.</param>
    /// <returns>
    /// UTF-16 LE/BE, UTF-8, UTF-7, UTF-32 LE or UTF-32 BE (code page 12001) when the matching
    /// BOM is present; null when no BOM is recognised or fewer than two bytes are supplied.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="BOMBytes"/> is null.</exception>
    public static Encoding DetectBOMBytes(byte[] BOMBytes)
    {
      // ArgumentNullException takes (paramName, message), in that order.
      if (BOMBytes == null)
        throw new ArgumentNullException("BOMBytes", "Must provide a valid BOM byte array!");

      int available = BOMBytes.Length;

      if (available < 2)
        return null;

      byte first = BOMBytes[0];
      byte second = BOMBytes[1];

      if (first == 0xff && second == 0xfe)
      {
        // FF FE 00 00 is the UTF-32 LE BOM; FF FE followed by anything else is UTF-16 LE.
        bool isUtf32 = available >= 4 && BOMBytes[2] == 0 && BOMBytes[3] == 0;
        return isUtf32 ? Encoding.UTF32 : Encoding.Unicode;
      }

      if (first == 0xfe && second == 0xff)
        return Encoding.BigEndianUnicode;

      if (available < 3)
        return null;

      byte third = BOMBytes[2];

      if (first == 0xef && second == 0xbb && third == 0xbf)
        return Encoding.UTF8;

      // NOTE(review): only the first three bytes ("+/v") are compared here, so plain text that
      // happens to start with those characters is also reported as UTF-7 - confirm this is acceptable.
      if (first == 0x2b && second == 0x2f && third == 0x76)
        return Encoding.UTF7;

      if (available < 4)
        return null;

      if (first == 0 && second == 0 && third == 0xfe && BOMBytes[3] == 0xff)
        return Encoding.GetEncoding(12001);

      return null;
    }
 
    // Well-formedness check for UTF-8, after the regular expression Martin Dürst gives in the
    // w3c.org FAQ entry http://www.w3.org/International/questions/qa-forms-utf-8.
    // It is applied to a string in which every char carries exactly one byte value (0-255).
    // Built once instead of on every call.
    private static readonly Regex _utf8Validator = new Regex(@"\A(?:"
      + @"[\x09\x0A\x0D\x20-\x7E]"             // ASCII
      + @"|[\xC2-\xDF][\x80-\xBF]"             // non-overlong 2-byte
      + @"|\xE0[\xA0-\xBF][\x80-\xBF]"         // excluding overlongs
      + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"  // straight 3-byte
      + @"|\xED[\x80-\x9F][\x80-\xBF]"         // excluding surrogates
      + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"      // planes 1-3
      + @"|[\xF1-\xF3][\x80-\xBF]{3}"          // planes 4-15
      + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"      // plane 16
      + @")*\z");

    /// <summary>
    /// Guesses whether a BOM-less byte sample is UTF-16 LE, UTF-16 BE or UTF-8, using the
    /// distribution of zero bytes and the presence of UTF-8 sequences that encode characters
    /// from the upper range of Windows-1252.
    /// </summary>
    /// <param name="SampleBytes">Bytes taken from the start of the text.</param>
    /// <returns>
    /// Encoding.Unicode, Encoding.BigEndianUnicode or Encoding.UTF8 when the corresponding
    /// thresholds are met; otherwise null (including for an empty sample).
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="SampleBytes"/> is null.</exception>
    public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
    {
      if (SampleBytes == null)
        throw new ArgumentNullException("SampleBytes", "Must provide a valid sample byte array!");

      // Nothing to measure; also avoids dividing by a zero length below.
      if (SampleBytes.Length == 0)
        return null;

      long oddBinaryNullsInSample = 0;
      long evenBinaryNullsInSample = 0;
      long suspiciousUTF8SequenceCount = 0;
      long suspiciousUTF8BytesTotal = 0;
      long likelyUSASCIIBytesInSample = 0;

      //Cycle through, keeping count of binary null positions, possible UTF-8
      // sequences from upper ranges of Windows-1252, and probable US-ASCII
      // character counts.
      int skipUTF8Bytes = 0;

      for (long currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
      {
        byte currentByte = SampleBytes[currentPos];

        //binary null distribution
        if (currentByte == 0)
        {
          if (currentPos % 2 == 0)
            evenBinaryNullsInSample++;
          else
            oddBinaryNullsInSample++;
        }

        //likely US-ASCII characters
        if (IsCommonUSASCIIByte(currentByte))
          likelyUSASCIIBytesInSample++;

        //suspicious sequences (look like UTF-8); bytes inside a sequence already counted are skipped
        if (skipUTF8Bytes > 0)
        {
          skipUTF8Bytes--;
        }
        else
        {
          int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);

          if (lengthFound > 0)
          {
            suspiciousUTF8SequenceCount++;
            suspiciousUTF8BytesTotal += lengthFound;
            skipUTF8Bytes = lengthFound - 1;
          }
        }
      }

      // Share of zero bytes among the even-indexed / odd-indexed half of the sample.
      double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
      double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

      //1: UTF-16 LE - in english / european environments, this is usually characterized by a
      // high proportion of odd binary nulls (starting at 0), with (as this is text) a low
      // proportion of even binary nulls.
      // The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
      // 60% nulls where you do expect nulls) are completely arbitrary.
      if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
        return Encoding.Unicode;

      //2: UTF-16 BE - the mirror image: mostly even binary nulls, few odd ones. Same thresholds.
      if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
        return Encoding.BigEndianUnicode;

      //3: UTF-8 - being well-formed UTF-8 says little about probability: pure 0-127 content is
      // identical in most western charsets. So we play stats on the "suspicious" sequences.
      //
      // The "Random" likelihood of any pair of randomly generated characters being one
      // of these "suspicious" character sequences is:
      // 128 / (256 * 256) = 0.2%.
      //
      // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
      // character range, so we assume that more than 1 in 500,000 of these character
      // sequences indicates UTF-8. The number 500,000 is completely arbitrary.
      bool hasSuspiciousSequences = suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1;

      // Without any suspicious sequence the answer is null regardless of validity, so the
      // (comparatively expensive) validation is skipped.
      if (!hasSuspiciousSequences)
        return null;

      // We can only assume these sequences are rare if this IS western text - in which case the
      // bulk of the remaining bytes should be plain US-ASCII. The 80% threshold is arbitrary.
      long bytesOutsideSuspiciousSequences = SampleBytes.Length - suspiciousUTF8BytesTotal;
      bool mostlyUSASCII =
        bytesOutsideSuspiciousSequences == 0 //all suspicious, so cannot evaluate proportion of US-Ascii
        || likelyUSASCIIBytesInSample * 1.0 / bytesOutsideSuspiciousSequences >= 0.8;

      if (mostlyUSASCII && IsWellFormedUTF8Sample(SampleBytes))
        return Encoding.UTF8;

      return null;
    }

    // Runs the UTF-8 validator over the sample, ignoring a multi-byte sequence that was cut off
    // by the end of the sample.
    private static bool IsWellFormedUTF8Sample(byte[] sampleBytes)
    {
      int usableLength = sampleBytes.Length - CountTruncatedTrailingUTF8Bytes(sampleBytes);

      // Map each byte to the char with the same numeric value. Encoding.ASCII must not be used
      // here: it decodes every byte above 0x7F to '?', which hides exactly the bytes the
      // validator has to inspect.
      char[] byteChars = new char[usableLength];
      for (int index = 0; index < usableLength; index++)
        byteChars[index] = (char)sampleBytes[index];

      return _utf8Validator.IsMatch(new string(byteChars));
    }

    // Returns how many bytes at the end of the sample belong to a multi-byte UTF-8 sequence whose
    // remaining bytes lie beyond the sample (0 when the sample does not end mid-sequence).
    private static int CountTruncatedTrailingUTF8Bytes(byte[] sampleBytes)
    {
      int lastIndex = sampleBytes.Length - 1;

      // A truncated sequence is at most 3 bytes long (lead byte plus up to two continuations).
      for (int back = 0; back < 3 && back <= lastIndex; back++)
      {
        byte candidate = sampleBytes[lastIndex - back];

        if ((candidate & 0xC0) == 0x80)
          continue; // continuation byte - keep looking for its lead byte

        int expectedLength;
        if ((candidate & 0xE0) == 0xC0)
          expectedLength = 2;
        else if ((candidate & 0xF0) == 0xE0)
          expectedLength = 3;
        else if ((candidate & 0xF8) == 0xF0)
          expectedLength = 4;
        else
          return 0; // ASCII or not a lead byte: leave the verdict to the validator

        int bytesPresent = back + 1;
        return bytesPresent < expectedLength ? bytesPresent : 0;
      }

      return 0;
    }
 
    /// <summary>
    /// Tells whether a byte is a tab, line feed, carriage return, or falls in the
    /// 0x20-0x7E range (space, punctuation, digits and letters).
    /// </summary>
    /// <param name="testByte">Byte value to classify.</param>
    /// <returns>true for the byte values listed above; false otherwise.</returns>
    private static bool IsCommonUSASCIIByte(byte testByte)
    {
      switch (testByte)
      {
        case 0x09: //tab
        case 0x0A: //lf
        case 0x0D: //cr
          return true;

        default:
          // 0x20-0x7E: punctuation, digits, capital and lowercase letters form one contiguous run.
          return testByte >= 0x20 && testByte <= 0x7E;
      }
    }
 
    /// <summary>
    /// Checks whether the bytes starting at <paramref name="currentPos"/> form one of a fixed
    /// list of 2- or 3-byte sequences that the caller treats as "suspicious" (likely UTF-8
    /// encodings of characters from the upper range of Windows-1252).
    /// </summary>
    /// <param name="SampleBytes">Sample being scanned.</param>
    /// <param name="currentPos">Index of the candidate lead byte.</param>
    /// <returns>
    /// 2 or 3 (the sequence length) when a listed sequence starts at the position; 0 otherwise,
    /// including when too few bytes remain in the sample for the sequence to be complete.
    /// </returns>
    private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
    {
      // Every listed sequence needs at least two bytes; checking up front keeps the reads below
      // inside the array even when a lead byte is the very last byte of the sample.
      long bytesAvailable = SampleBytes.Length - currentPos;
      if (bytesAvailable < 2)
        return 0;

      byte lead = SampleBytes[currentPos];
      byte second = SampleBytes[currentPos + 1];

      switch (lead)
      {
        case 0xC2:
          if (second == 0x81 || second == 0x8D || second == 0x8F
            || second == 0x90 || second == 0x9D
            || (second >= 0xA0 && second <= 0xBF))
            return 2;
          return 0;

        case 0xC3:
          return (second >= 0x80 && second <= 0xBF) ? 2 : 0;

        case 0xC5:
          if (second == 0x92 || second == 0x93
            || second == 0xA0 || second == 0xA1
            || second == 0xB8 || second == 0xBD || second == 0xBE)
            return 2;
          return 0;

        case 0xC6:
          return second == 0x92 ? 2 : 0;

        case 0xCB:
          return (second == 0x86 || second == 0x9C) ? 2 : 0;

        case 0xE2:
        {
          // Three-byte sequences: the third byte must exist too.
          if (bytesAvailable < 3)
            return 0;

          byte third = SampleBytes[currentPos + 2];

          if (second == 0x80)
          {
            switch (third)
            {
              case 0x93: case 0x94:
              case 0x98: case 0x99: case 0x9A:
              case 0x9C: case 0x9D: case 0x9E:
              case 0xA0: case 0xA1: case 0xA2:
              case 0xA6:
              case 0xB0:
              case 0xB9: case 0xBA:
                return 3;

              default:
                return 0;
            }
          }

          if ((second == 0x82 && third == 0xAC)
            || (second == 0x84 && third == 0xA2))
            return 3;

          return 0;
        }

        default:
          return 0;
      }
    }
 
  }
}

使用方法:

// Detect the encoding of the file at the given path; Encoding.Default is the fallback passed to the detector.
Encoding fileEncoding = TextFileEncodingDetector.DetectTextFileEncoding("your file path",Encoding.Default);

以上就是本文的全部内容,希望对大家学习C#程序设计有所帮助。

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/124422.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号