栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > C/C++/C# > C#教程

C#自写的一个HTML解析类(类似XElement语法)

C#教程 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

C#自写的一个HTML解析类(类似XElement语法)

功能:

1、轻松获取指定的HTML元素。
2、可以根据属性标签进行筛选
3、返回的都是List<HtmlInfo>强类型,无需转换

 
用过XElement的都知道 用来解析XML非常的方便,但是对于HTML的格式多样化实在是没办法兼容。

所以我就写了这么一个类似XElement的 XHtmlElement

用法:

string filePath = Server.MapPath("~/file/test.htm");
      // Read the HTML source from the file.
      string mailBody = FileHelper.FileToString(filePath);

      XHtmlElement xh = new XHtmlElement(mailBody);

      // First-level <a> elements inside <body> whose class attribute equals "icon".
      var link = xh.Descendants("body").ChildDescendants("a").Where(c => c.Attributes.Any(a => a.Key == "class" && a.Value == "icon")).ToList();

      // <a> elements that carry an href attribute.
      var links = xh.Descendants("a").Where(c => c.Attributes.Any(a => a.Key == "href")).ToList();
      foreach (var r in links)
      {
 Response.Write(r.Attributes.Single(c => c.Key == "href").Value); // write out the href value
      }

      // The nearest <img> elements (Descendants returns a list, not a single element).
      var img = xh.Descendants("img");

      // The nearest <p> element together with the other <p> elements on the same level.
      var ps = xh.Descendants("p");

代码:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace SyntacticSugar
{
  /// <summary>
  /// Regex-based HTML parsing helper with an XElement-like query style.
  /// Created 2015-04-23 by sunkaixuan (QQ 610262374); suggestions on naming and style are welcome.
  /// </summary>
  /// <remarks>
  /// NOTE(review): the published listing of this class had lost its generic type
  /// arguments, its regex backslashes and every regex fragment that looked like
  /// markup. Those parts are restored here; patterns commented as "reconstructed"
  /// should be confirmed against the author's original source.
  /// </remarks>
  public class XHtmlElement
  {
    // Start / end markers of an IE conditional comment ("<!--[if ...]> ... <![endif]-->").
    // Reconstructed: only the leading "\s{0,10}" survived in the published listing.
    private const string ConditionalCommentBeginReg = @"\s{0,10}<!--\[if";
    private const string ConditionalCommentEndReg = @"\[endif\]-->";

    // Options shared by every tag-matching regex so that begin/end tags are
    // treated consistently (HTML tag names are case-insensitive, tags may span lines).
    private const RegexOptions TagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    private const string ParseErrorMessage = "SORRY,您的HTML格式不能解析!!!";

    private string _html;

    /// <summary>Creates a parser over the given HTML fragment.</summary>
    /// <param name="html">HTML text to query; a null value is rejected when a query method is called.</param>
    public XHtmlElement(string html)
    {
      _html = html;
    }

    /// <summary>
    /// Returns the nearest elements named <paramref name="elementName"/>: the matching
    /// first-level elements if there are any, otherwise the matches found on the first
    /// nested level that contains one (searched depth-first, in document order).
    /// </summary>
    /// <param name="elementName">Tag name to look for (case-insensitive); null selects every element.</param>
    /// <returns>The matching elements; an empty list (never null) when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">The parser was constructed with null HTML.</exception>
    public List<HtmlInfo> Descendants(string elementName = null)
    {
      if (_html == null)
      {
        // The text used to be passed as the parameter name; pass it as the message instead.
        throw new ArgumentNullException("html", "html不能这空!");
      }

      var allList = RootDescendants(_html);
      var reval = allList.Where(c => IsNameMatch(c, elementName)).ToList();
      if (reval.Count == 0)
      {
        reval = GetDescendantsSource(allList, elementName);
      }
      return reval;
    }

    /// <summary>
    /// Parses the first-level (root) elements of an HTML fragment.
    /// </summary>
    /// <param name="html">Fragment to parse; null means the HTML passed to the constructor.</param>
    /// <returns>One <see cref="HtmlInfo"/> per first-level element, in extraction order.</returns>
    /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor HTML are null.</exception>
    /// <exception cref="Exception">The fragment contains tags that cannot be balanced.</exception>
    public List<HtmlInfo> RootDescendants(string html = null)
    {
      if (html == null) html = _html;
      if (html == null)
      {
        throw new ArgumentNullException("html");
      }

      List<string> eleList = new List<string>();
      GetElementsStringList(html, ref eleList);

      List<HtmlInfo> reval = new List<HtmlInfo>(eleList.Count);
      foreach (var r in eleList)
      {
        HtmlInfo data = new HtmlInfo();
        data.OldFullHtml = r;
        data.SameLeveHtml = html;
        // First run of letters/digits that follows '<' or whitespace and is followed by
        // whitespace, '/' or '>'. Digits are allowed so that h1..h6 are recognised.
        data.TagName = Regex.Match(r, @"(?<=\s|<)[a-zA-Z][a-zA-Z0-9]*(?=[\s/>])", RegexOptions.IgnoreCase).Value;
        // Everything between the first '>' and the last '<' of the element.
        data.InnerHtml = Regex.Match(r, @"(?<=>).+(?=<)", RegexOptions.Singleline).Value;
        // Singleline so that an opening tag broken across lines is still found.
        var eleBegin = Regex.Match(r, "<.+?>", RegexOptions.Singleline).Value;
        data.Attributes = ParseAttributes(eleBegin);
        reval.Add(data);
      }
      return reval;
    }

    #region private
    /// <summary>True when <paramref name="elementName"/> is null or equals the element's tag name, ignoring case.</summary>
    private static bool IsNameMatch(HtmlInfo element, string elementName)
    {
      return elementName == null
        || string.Equals(element.TagName, elementName, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Extracts the double-quoted attributes of an opening tag.
    /// Capture groups are used instead of splitting on '=' so that values containing '='
    /// (for example query strings) and empty values are kept intact; hyphenated names such
    /// as data-id are kept whole. When a name is repeated the first occurrence wins.
    /// </summary>
    private static Dictionary<string, string> ParseAttributes(string openingTag)
    {
      var attributes = new Dictionary<string, string>();
      var matches = Regex.Matches(openingTag, @"(?<name>[a-zA-Z_:][\w:.\-]*)\s*=\s*""(?<value>[^""]*)""");
      foreach (Match m in matches)
      {
        string name = m.Groups["name"].Value;
        if (!attributes.ContainsKey(name))
        {
          attributes.Add(name, m.Groups["value"].Value);
        }
      }
      return attributes;
    }

    /// <summary>
    /// Depth-first search below <paramref name="allList"/> for the first nesting level that
    /// contains an element named <paramref name="elementName"/>.
    /// </summary>
    /// <returns>The matches of that level, or an empty list when none exist.</returns>
    private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
    {
      foreach (var r in allList)
      {
        // Nothing to descend into when the element holds no markup.
        if (r.InnerHtml == null || !r.InnerHtml.Contains("<")) continue;

        var children = RootDescendants(r.InnerHtml);
        var childList = children.Where(c => IsNameMatch(c, elementName)).ToList();
        if (childList.Count == 0)
        {
          childList = GetDescendantsSource(children, elementName);
        }
        if (childList.Count > 0)
        {
          return childList;
        }
      }
      // Return an empty list rather than null so callers can chain LINQ safely.
      return new List<HtmlInfo>();
    }

    /// <summary>
    /// Splits <paramref name="html"/> into the raw markup of its first-level elements and
    /// appends each one to <paramref name="eleList"/>.
    /// </summary>
    /// <exception cref="Exception">An element could not be delimited (unbalanced tags).</exception>
    private void GetElementsStringList(string html, ref List<string> eleList)
    {
      // Iterative rather than recursive so a long run of siblings cannot exhaust the stack.
      while (true)
      {
        // Name of the first tag in the remaining markup.
        // Reconstructed: the published pattern was reduced to "(?<=|s)".
        string tagName = Regex.Match(html, @"(?<=<)[a-zA-Z][a-zA-Z0-9]*(?=[\s/>])").Value;
        if (string.IsNullOrEmpty(tagName)) return;

        // Reconstructed begin/end patterns. The lookahead keeps "<i" from also matching "<img".
        string currentTagBeginReg = "<" + tagName + @"(?=[\s/>]).*?>";
        string currentTagEndReg = "</" + tagName + @"\s*>";

        string firstOpenTag = Regex.Match(html, currentTagBeginReg, TagOptions).Value;
        string eleHtml;
        // Case 1: self-closing tag, e.g. <a/>
        // Case 2: normal element, e.g. <a></a>
        // Case 3: opening tag that is never closed (malformed or void), e.g. <a>
        // Case 4: IE conditional comment, <!--[if ...]> ... <![endif]-->
        if (firstOpenTag.EndsWith("/>", StringComparison.Ordinal))
        {
          // Decided on the first tag itself, so a later <x/> cannot be lifted out of its parent.
          eleHtml = firstOpenTag;
        }
        else if (!Regex.IsMatch(html, currentTagEndReg, RegexOptions.IgnoreCase))
        {
          if (Regex.IsMatch(html, ConditionalCommentBeginReg))
          {
            eleHtml = GetElementString(html, ConditionalCommentBeginReg, ConditionalCommentEndReg, 1);
          }
          else
          {
            eleHtml = firstOpenTag;
          }
        }
        else
        {
          eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
        }

        // An empty result means the tags could not be balanced. The original reached the
        // same error through String.Replace rejecting an empty search string.
        int at = string.IsNullOrEmpty(eleHtml) ? -1 : html.IndexOf(eleHtml, StringComparison.Ordinal);
        if (at < 0)
        {
          throw new Exception(ParseErrorMessage);
        }

        eleList.Add(eleHtml);
        // Remove only this occurrence: replacing every identical occurrence dropped
        // duplicate siblings and deleted identical markup nested in later elements.
        html = html.Remove(at, eleHtml.Length);
        // NOTE(review): this pattern literal was lost in the published listing; stripping a
        // DOCTYPE declaration is a reconstruction - confirm against the original source.
        html = Regex.Replace(html, @"<!DOCTYPE.*?>", "", TagOptions);
        if (Regex.IsMatch(html, @"^\s*$")) return;
      }
    }

    /// <summary>
    /// Returns the markup from the first begin tag up to the closing tag that balances it,
    /// trying <paramref name="i"/>, i+1, ... closing tags until begin and end counts agree.
    /// </summary>
    /// <returns>The balanced markup, or an empty string when no balanced span exists.</returns>
    private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
    {
      while (true)
      {
        string newHtml = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, i);
        // Self-closing tags open nothing, so they are not counted as begin tags.
        int beginCount = Regex.Matches(newHtml, currentTagBeginReg, TagOptions)
          .Cast<Match>()
          .Count(m => !m.Value.EndsWith("/>", StringComparison.Ordinal));
        int endCount = Regex.Matches(newHtml, currentTagEndReg, RegexOptions.IgnoreCase).Count;
        if (beginCount == endCount)
        {
          // Balanced. A failed match gives "" with 0 == 0, which also ends the search.
          return newHtml;
        }
        i++;
      }
    }

    /// <summary>
    /// Matches the first begin tag followed by exactly <paramref name="i"/> closing tags
    /// (each reached lazily) and returns the matched text, or "" when there is no match.
    /// </summary>
    private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
    {
      return Regex.Match(val, currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?", RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;
    }
    #endregion
  }
  /// <summary>
  /// Query extensions that continue a search from the first element of a result list.
  /// </summary>
  public static class XHtmlElementExtendsion
  {
    /// <summary>
    /// Searches the inner HTML of the first element in <paramref name="htmlInfoList"/> for
    /// the nearest elements named <paramref name="elementName"/>.
    /// </summary>
    /// <param name="htmlInfoList">Result of a previous query; only its first element is used.</param>
    /// <param name="elementName">Tag name to look for; null selects every element.</param>
    /// <returns>The matches, or an empty list when the input is null/empty or nothing matches.</returns>
    public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
      var first = FirstOrNull(htmlInfoList);
      if (first == null || first.InnerHtml == null)
      {
        // An empty previous result yields an empty result instead of throwing from First().
        return new List<HtmlInfo>();
      }
      XHtmlElement xhe = new XHtmlElement(first.InnerHtml);
      return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Returns the first-level elements inside the first element of
    /// <paramref name="htmlInfoList"/>, optionally filtered by tag name.
    /// </summary>
    /// <param name="htmlInfoList">Result of a previous query; only its first element is used.</param>
    /// <param name="elementName">Tag name to keep (case-insensitive); null keeps every element.</param>
    /// <returns>The matching child elements, or an empty list when the input is null/empty.</returns>
    public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
      var first = FirstOrNull(htmlInfoList);
      if (first == null || first.InnerHtml == null)
      {
        return new List<HtmlInfo>();
      }
      var html = first.InnerHtml;
      XHtmlElement xhe = new XHtmlElement(html);
      // Case-insensitive, matching the comparison used by XHtmlElement.Descendants.
      return xhe.RootDescendants(html)
        .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
        .ToList();
    }

    /// <summary>
    /// Locates, inside <paramref name="fullHtml"/>, the element that encloses the level the
    /// first element of <paramref name="htmlInfoList"/> was parsed from, and parses it.
    /// </summary>
    /// <param name="htmlInfoList">Result of a previous query; only its first element is used.</param>
    /// <param name="fullHtml">The complete document the elements came from.</param>
    /// <returns>The parsed parent element(s), or an empty list when no parent is found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fullHtml"/> is null.</exception>
    public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
    {
      if (fullHtml == null)
      {
        throw new ArgumentNullException("fullHtml");
      }
      var first = FirstOrNull(htmlInfoList);
      if (first == null || string.IsNullOrEmpty(first.SameLeveHtml))
      {
        // String.Replace rejects an empty search string; there is nothing to locate anyway.
        return new List<HtmlInfo>();
      }

      string sameLevelHtml = first.SameLeveHtml;
      // Swap the sibling markup for a unique placeholder so the regex only has to find
      // "<open tag> placeholder </close tag>".
      string placeholder = Guid.NewGuid().ToString();
      string markedHtml = fullHtml.Replace(sameLevelHtml, placeholder);
      // NOTE(review): the closing-tag tail of this pattern was lost in the published listing
      // (it ended in a bare ".*?"); "</.+?>" is a reconstruction - confirm against the original.
      var parentHtml = Regex.Match(
        markedHtml,
        @"<[^<]+?>[^<]*?" + Regex.Escape(placeholder) + @".*?</.+?>",
        RegexOptions.Singleline).Value;
      parentHtml = parentHtml.Replace(placeholder, sameLevelHtml);
      XHtmlElement xhe = new XHtmlElement(parentHtml);
      return xhe.RootDescendants() ?? new List<HtmlInfo>();
    }

    /// <summary>First element of the sequence, or null when the sequence is null or empty.</summary>
    private static HtmlInfo FirstOrNull(IEnumerable<HtmlInfo> htmlInfoList)
    {
      return htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
    }
  }
  /// <summary>
  /// Describes one HTML element produced by the parser.
  /// </summary>
  public class HtmlInfo
  {
    /// <summary>
    /// Tag name of the element.
    /// </summary>
    public string TagName { get; set; }

    /// <summary>
    /// Attributes of the element, keyed by attribute name.
    /// </summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>
    /// Markup contained inside the element.
    /// </summary>
    public string InnerHtml { get; set; }

    /// <summary>
    /// Raw markup of the element exactly as it was extracted from the source.
    /// </summary>
    public string OldFullHtml { get; set; }

    /// <summary>
    /// The HTML fragment this element was parsed from, i.e. the markup that holds the
    /// element together with its same-level siblings.
    /// </summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Rebuilds the element's markup from <see cref="TagName"/>, <see cref="Attributes"/>
    /// and <see cref="InnerHtml"/>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the closing tag was missing from the published format string (it was
    /// stripped as markup); "&lt;/{0}&gt;" is restored here. Attribute values are written
    /// as-is, without HTML encoding.
    /// </remarks>
    public string FullHtml
    {
      get
      {
        string attributesString = string.Empty;
        if (Attributes != null && Attributes.Count > 0)
        {
          // The quotes around the value must be escaped inside the C# string literal.
          attributesString = string.Join(" ", Attributes.Select(c => string.Format("{0}=\"{1}\"", c.Key, c.Value)));
        }
        return string.Format("<{0} {2}>{1}</{0}>", TagName, InnerHtml, attributesString);
      }
    }
  }
}

前台HTML:




  


  我是1 
  icon
  


转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/125174.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号