栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > C/C++/C# > C#教程

C#实现将HTML转换成纯文本的方法

C#教程 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

C#实现将HTML转换成纯文本的方法

本文实例讲述了C#实现将HTML转换成纯文本的方法。分享给大家供大家参考。具体如下:

使用方法:
代码如下:
HtmlToText convert = new HtmlToText();
textBox2.Text = convert.Convert(textBox1.Text);

C#代码如下:

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a single-pass scanner over the input string. Tags are
/// replaced by the whitespace registered for them in <c>_tags</c> (or dropped),
/// the content of tags listed in <c>_ignoreTags</c> is skipped entirely, runs of
/// whitespace are collapsed outside of &lt;pre&gt;, and HTML entities are decoded
/// once at the end. Parsing state is kept in instance fields, so one instance
/// must not run two conversions at the same time.
/// </remarks>
class HtmlToText
{
  // Static data tables
  // Maps a lower-case tag name (closing tags include the leading '/') to the
  // text that is written to the output when that tag is encountered.
  protected static Dictionary<string, string> _tags;
  // Lower-case names of tags whose entire inner content is discarded.
  protected static HashSet<string> _ignoreTags;

  // Instance variables
  protected TextBuilder _text;  // Output accumulator
  protected string _html;       // Input being scanned
  protected int _pos;           // Current scan position within _html

  // Static constructor (one time only)
  static HtmlToText()
  {
    _tags = new Dictionary<string, string>
    {
      { "address", "\n" },
      { "blockquote", "\n" },
      { "div", "\n" },
      { "dl", "\n" },
      { "fieldset", "\n" },
      { "form", "\n" },
      { "h1", "\n" },
      { "/h1", "\n" },
      { "h2", "\n" },
      { "/h2", "\n" },
      { "h3", "\n" },
      { "/h3", "\n" },
      { "h4", "\n" },
      { "/h4", "\n" },
      { "h5", "\n" },
      { "/h5", "\n" },
      { "h6", "\n" },
      { "/h6", "\n" },
      { "p", "\n" },
      { "/p", "\n" },
      { "table", "\n" },
      { "/table", "\n" },
      { "ul", "\n" },
      { "/ul", "\n" },
      { "ol", "\n" },
      { "/ol", "\n" },
      { "/li", "\n" },
      { "br", "\n" },
      { "/td", "\t" },
      { "/tr", "\n" },
      { "/pre", "\n" }
    };

    _ignoreTags = new HashSet<string>
    {
      "script",
      "noscript",
      "style",
      "object"
    };
  }

  /// <summary>
  /// Converts the given HTML to plain text and returns the result.
  /// </summary>
  /// <param name="html">HTML to be converted. <c>null</c> is treated as an
  /// empty document.</param>
  /// <returns>Resulting plain text, with HTML entities decoded. Lines are
  /// terminated with <see cref="Environment.NewLine"/> except inside
  /// preformatted sections, where input characters are copied verbatim.</returns>
  public string Convert(string html)
  {
    // Initialize state variables
    _text = new TextBuilder();
    _html = html ?? String.Empty;
    _pos = 0;

    // Process input
    while (!EndOfText)
    {
      if (Peek() == '<')
      {
        // HTML tag
        bool selfClosing;
        string tag = ParseTag(out selfClosing);

        // Handle special tag cases
        if (tag == "body")
        {
          // Discard content before <body>
          _text.Clear();
        }
        else if (tag == "/body")
        {
          // Discard content after </body>
          _pos = _html.Length;
        }
        else if (tag == "pre")
        {
          // Enter preformatted mode
          _text.Preformatted = true;
          EatWhitespaceTonextLine();
        }
        else if (tag == "/pre")
        {
          // Exit preformatted mode
          _text.Preformatted = false;
        }

        // Emit the whitespace registered for this tag, if any
        string value;
        if (_tags.TryGetValue(tag, out value))
          _text.Write(value);

        // Skip everything inside tags whose content is not text
        if (_ignoreTags.Contains(tag))
          EatInnerContent(tag);
      }
      else if (Char.IsWhiteSpace(Peek()))
      {
        // Whitespace (treat all as space unless preformatted)
        _text.Write(_text.Preformatted ? Peek() : ' ');
        MoveAhead();
      }
      else
      {
        // Other text
        _text.Write(Peek());
        MoveAhead();
      }
    }

    // Return result; entities are decoded last so that decoded '<' or '&'
    // characters are never re-interpreted as markup.
    return HttpUtility.HtmlDecode(_text.ToString());
  }

  // Eats all characters that are part of the current tag and returns the
  // lower-cased tag name (closing tags keep their leading '/'). Returns an
  // empty string if the current character is not '<'. selfClosing is set
  // when a '/' appears outside of a quoted value after the tag name.
  protected string ParseTag(out bool selfClosing)
  {
    string tag = String.Empty;
    selfClosing = false;

    if (Peek() == '<')
    {
      MoveAhead();

      // Parse tag name
      EatWhitespace();
      int start = _pos;
      if (Peek() == '/')
        MoveAhead();
      while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
        Peek() != '/' && Peek() != '>')
        MoveAhead();
      // Invariant lower-casing: culture-sensitive ToLower() would turn "DIV"
      // into "dıv" under Turkish cultures and break the table lookups.
      tag = _html.Substring(start, _pos - start).ToLowerInvariant();

      // Parse rest of tag
      while (!EndOfText && Peek() != '>')
      {
        if (Peek() == '"' || Peek() == '\'')
          EatQuotedValue();
        else
        {
          if (Peek() == '/')
            selfClosing = true;
          MoveAhead();
        }
      }
      // Step over the closing '>'
      MoveAhead();
    }
    return tag;
  }

  // Consumes inner content from the current tag, up to and including the
  // matching end tag (or to the end of the input if there is none).
  protected void EatInnerContent(string tag)
  {
    string endTag = "/" + tag;

    if (tag == "script" || tag == "style")
    {
      // Script and style bodies are raw text: a '<' inside them (e.g.
      // "a < b") is not markup, so look for the literal closing tag
      // instead of parsing the content as tags.
      string closeMarker = "<" + endTag;
      while (!EndOfText)
      {
        int close = _html.IndexOf(closeMarker, _pos, StringComparison.OrdinalIgnoreCase);
        if (close < 0)
        {
          _pos = _html.Length;
          return;
        }
        _pos = close;
        bool ignored;
        // ParseTag always advances past the '<', so a false prefix match
        // such as "</scripts>" cannot cause an endless loop.
        if (ParseTag(out ignored) == endTag)
          return;
      }
      return;
    }

    // Other ignored tags may contain real markup, including nested
    // instances of the same tag; track the nesting depth so that only the
    // end tag matching the outermost start tag stops the scan.
    int depth = 1;
    while (!EndOfText)
    {
      if (Peek() == '<')
      {
        bool selfClosing;
        string parsed = ParseTag(out selfClosing);
        if (parsed == endTag)
        {
          depth--;
          if (depth == 0)
            return;
        }
        else if (parsed == tag && !selfClosing)
        {
          depth++;
        }
      }
      else MoveAhead();
    }
  }

  // Returns true if the current position is at the end of
  // the string
  protected bool EndOfText
  {
    get { return (_pos >= _html.Length); }
  }

  // Safely returns the character at the current position
  // ((char)0 when positioned at the end of the input)
  protected char Peek()
  {
    return (_pos < _html.Length) ? _html[_pos] : (char)0;
  }

  // Safely advances the current position to the next character
  protected void MoveAhead()
  {
    _pos = Math.Min(_pos + 1, _html.Length);
  }

  // Moves the current position to the next non-whitespace
  // character.
  protected void EatWhitespace()
  {
    while (Char.IsWhiteSpace(Peek()))
      MoveAhead();
  }

  // Moves the current position to the next non-whitespace
  // character or the start of the next line, whichever
  // comes first
  protected void EatWhitespaceTonextLine()
  {
    while (Char.IsWhiteSpace(Peek()))
    {
      char c = Peek();
      MoveAhead();
      if (c == '\n')
        break;
    }
  }

  // Moves the current position past a quoted value. The value ends at the
  // matching quote character or at a line break, whichever comes first;
  // if neither is found the position moves to the end of the input.
  protected void EatQuotedValue()
  {
    char c = Peek();
    if (c == '"' || c == '\'')
    {
      // Opening quote
      MoveAhead();
      // Find end of value
      _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
      if (_pos < 0)
        _pos = _html.Length;
      else
        MoveAhead();  // Closing quote
    }
  }

  /// <summary>
  /// A StringBuilder class that helps eliminate excess whitespace.
  /// </summary>
  protected class TextBuilder
  {
    private StringBuilder _text;      // Completed output
    private StringBuilder _currLine;  // Line currently being assembled
    private int _emptyLines;          // Consecutive empty lines flushed so far
    private bool _preformatted;

    // Construction
    public TextBuilder()
    {
      _text = new StringBuilder();
      _currLine = new StringBuilder();
      _emptyLines = 0;
      _preformatted = false;
    }

    /// <summary>
    /// Normally, extra whitespace characters are discarded.
    /// If this property is set to true, they are passed
    /// through unchanged.
    /// </summary>
    public bool Preformatted
    {
      get
      {
        return _preformatted;
      }
      set
      {
        if (value)
        {
          // Clear line buffer if changing to
          // preformatted mode
          if (_currLine.Length > 0)
            FlushCurrLine();
          _emptyLines = 0;
        }
        _preformatted = value;
      }
    }

    /// <summary>
    /// Clears all current text.
    /// </summary>
    public void Clear()
    {
      _text.Length = 0;
      _currLine.Length = 0;
      _emptyLines = 0;
    }

    /// <summary>
    /// Writes the given string to the output buffer.
    /// </summary>
    /// <param name="s">String to write</param>
    public void Write(string s)
    {
      foreach (char c in s)
        Write(c);
    }

    /// <summary>
    /// Writes the given character to the output buffer.
    /// </summary>
    /// <param name="c">Character to write</param>
    public void Write(char c)
    {
      if (_preformatted)
      {
        // Write preformatted character
        _text.Append(c);
      }
      else
      {
        if (c == '\r')
        {
          // Ignore carriage returns. We'll process
          // '\n' if it comes next
        }
        else if (c == '\n')
        {
          // Flush current line
          FlushCurrLine();
        }
        else if (Char.IsWhiteSpace(c))
        {
          // Write single space character
          int len = _currLine.Length;
          if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
            _currLine.Append(' ');
        }
        else
        {
          // Add character to current line
          _currLine.Append(c);
        }
      }
    }

    // Appends the current line to output buffer
    protected void FlushCurrLine()
    {
      // Get current line
      string line = _currLine.ToString().Trim();

      // After trimming, a line without visible characters is empty
      if (line.Length == 0)
      {
        // An empty line: keep at most one in a row, and none at the
        // very start of the output
        _emptyLines++;
        if (_emptyLines < 2 && _text.Length > 0)
          _text.AppendLine();
      }
      else
      {
        // A non-empty line
        _emptyLines = 0;
        _text.AppendLine(line);
      }

      // Reset current line
      _currLine.Length = 0;
    }

    /// <summary>
    /// Returns the current output as a string.
    /// </summary>
    public override string ToString()
    {
      if (_currLine.Length > 0)
        FlushCurrLine();
      return _text.ToString();
    }
  }
}

希望本文所述对大家的C#程序设计有所帮助。

转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/124953.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号