栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 面试经验 > 面试问答

使用HTML Agility Pack去除不在白名单中的标记

面试问答 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

使用HTML Agility Pack去除不在白名单中的标记

呵呵,看来我在某人发表的博客文章中几乎找到了答案。

using System.Collections.Generic;
using System.Linq;
using HtmlAgilityPack;

namespace Wayloop.Blog.Core.Markup
{
    /// <summary>
    /// Whitelist-based HTML sanitizer built on HTML Agility Pack.
    /// Elements whose tag name is not in the whitelist are removed together
    /// with their content; attributes not listed for a whitelisted tag are
    /// removed from that element.
    /// </summary>
    public static class HtmlSanitizer
    {
        // Maps an allowed tag name to the attribute names allowed on it.
        // A null value means the tag is allowed but none of its attributes are.
        private static readonly IDictionary<string, string[]> Whitelist;

        static HtmlSanitizer()
        {
            Whitelist = new Dictionary<string, string[]>
            {
                { "a", new[] { "href" } },
                { "strong", null },
                { "em", null },
                { "blockquote", null },
            };
        }

        /// <summary>
        /// Parses <paramref name="input"/> as HTML, removes every element and
        /// attribute not permitted by the whitelist, and returns the result.
        /// </summary>
        /// <param name="input">HTML fragment to sanitize.</param>
        /// <returns>The sanitized markup, trimmed of surrounding whitespace.</returns>
        public static string Sanitize(string input)
        {
            // NOTE(review): attribute values are not inspected, so an allowed
            // "href" may still carry a "javascript:" URL. Validate separately
            // if the output is rendered in a browser.
            var htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(input);
            SanitizeNode(htmlDocument.DocumentNode);
            return htmlDocument.DocumentNode.WriteTo().Trim();
        }

        /// <summary>Sanitizes every child of <paramref name="parentNode"/>.</summary>
        private static void SanitizeChildren(HtmlNode parentNode)
        {
            // Walk backwards: SanitizeNode may remove the child at index i,
            // which would shift the indices of any later siblings.
            for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
            {
                SanitizeNode(parentNode.ChildNodes[i]);
            }
        }

        /// <summary>
        /// Removes <paramref name="node"/> if it is a non-whitelisted element;
        /// otherwise strips its disallowed attributes and recurses into its children.
        /// </summary>
        private static void SanitizeNode(HtmlNode node)
        {
            if (node.NodeType == HtmlNodeType.Element)
            {
                string[] allowedAttributes;
                if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
                {
                    // Disallowed element: drop it and everything inside it.
                    node.ParentNode.RemoveChild(node);
                    return;
                }

                if (node.HasAttributes)
                {
                    for (int i = node.Attributes.Count - 1; i >= 0; i--)
                    {
                        HtmlAttribute currentAttribute = node.Attributes[i];

                        // A null list means "no attributes allowed". Without the
                        // null check, Contains would throw on such entries.
                        if (allowedAttributes == null
                            || !allowedAttributes.Contains(currentAttribute.Name))
                        {
                            node.Attributes.Remove(currentAttribute);
                        }
                    }
                }
            }

            if (node.HasChildNodes)
            {
                SanitizeChildren(node);
            }
        }
    }
}

我从这里得到了HtmlSanitizer。显然,它不会只剥离标签(保留其中的内容),而是会把整个元素一并删除。

好的,这是以后需要它的人的解决方案。

/// <summary>
/// Whitelist-based HTML sanitizer built on HTML Agility Pack.
/// Unlike a remove-the-element approach, tags that are not whitelisted are
/// stripped while their (sanitized) content is kept. Attributes not listed
/// for a whitelisted tag are removed.
/// </summary>
public static class HtmlSanitizer
{
    // Every disallowed element is renamed to this placeholder so that a single,
    // always-valid XPath can select them all afterwards. Selecting by the
    // original tag names can fail for names XPath cannot parse (for example
    // XML-namespace style names). Kept all-lowercase so the name is the same
    // before and after the document is serialized and re-parsed.
    private const string PlaceholderName = "removeablenode";

    // Maps an allowed tag name to the attribute names allowed on it.
    // A null value means the tag is allowed but none of its attributes are.
    private static readonly IDictionary<string, string[]> Whitelist;

    static HtmlSanitizer()
    {
        Whitelist = new Dictionary<string, string[]>
        {
            { "a", new[] { "href" } },
            { "strong", null },
            { "em", null },
            { "blockquote", null },
            { "b", null },
            { "p", null },
            { "ul", null },
            { "ol", null },
            { "li", null },
            { "div", new[] { "align" } },
            { "strike", null },
            { "u", null },
            { "sub", null },
            { "sup", null },
            { "table", null },
            { "tr", null },
            { "td", null },
            { "th", null }
        };
    }

    /// <summary>
    /// Parses <paramref name="input"/> as HTML, strips every tag and attribute
    /// not permitted by the whitelist, and returns the resulting markup.
    /// </summary>
    /// <param name="input">HTML fragment to sanitize; may be null.</param>
    /// <returns>
    /// The sanitized markup, or an empty string when <paramref name="input"/>
    /// is null, empty or whitespace only.
    /// </returns>
    public static string Sanitize(string input)
    {
        if (input == null || input.Trim().Length < 1)
        {
            return string.Empty;
        }

        // NOTE(review): attribute values are not inspected, so an allowed
        // "href" may still carry a "javascript:" URL. Validate separately
        // if the output is rendered in a browser.
        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(input);

        // Pass 1: strip attributes and rename disallowed elements.
        SanitizeNode(htmlDocument.DocumentNode);

        // Pass 2: unwrap the renamed elements, keeping their content.
        return StripHtml(htmlDocument.DocumentNode.WriteTo().Trim(), "//" + PlaceholderName);
    }

    /// <summary>Sanitizes every child of <paramref name="parentNode"/>.</summary>
    private static void SanitizeChildren(HtmlNode parentNode)
    {
        for (int i = parentNode.ChildNodes.Count - 1; i >= 0; i--)
        {
            SanitizeNode(parentNode.ChildNodes[i]);
        }
    }

    /// <summary>
    /// Renames <paramref name="node"/> to the placeholder if it is a
    /// non-whitelisted element, otherwise strips its disallowed attributes.
    /// Recurses into the children in both cases.
    /// </summary>
    private static void SanitizeNode(HtmlNode node)
    {
        if (node.NodeType == HtmlNodeType.Element)
        {
            string[] allowedAttributes;
            if (!Whitelist.TryGetValue(node.Name, out allowedAttributes))
            {
                // Mark for unwrapping. The content stays, so it must still be sanitized.
                node.Name = PlaceholderName;
                if (node.HasChildNodes)
                {
                    SanitizeChildren(node);
                }

                return;
            }

            if (node.HasAttributes)
            {
                for (int i = node.Attributes.Count - 1; i >= 0; i--)
                {
                    HtmlAttribute currentAttribute = node.Attributes[i];

                    // A null list means "no attributes allowed".
                    if (allowedAttributes == null
                        || !allowedAttributes.Contains(currentAttribute.Name))
                    {
                        node.Attributes.Remove(currentAttribute);
                    }
                }
            }
        }

        if (node.HasChildNodes)
        {
            SanitizeChildren(node);
        }
    }

    /// <summary>
    /// Re-parses <paramref name="html"/> and removes every node matched by
    /// <paramref name="xPath"/> while keeping that node's children in place.
    /// </summary>
    /// <param name="html">Markup produced by the first sanitizing pass.</param>
    /// <param name="xPath">XPath selecting the nodes to unwrap; may be empty.</param>
    /// <returns>The inner markup of the re-parsed document.</returns>
    private static string StripHtml(string html, string xPath)
    {
        var htmlDoc = new HtmlDocument();
        htmlDoc.LoadHtml(html);

        if (xPath.Length > 0)
        {
            // SelectNodes returns null (not an empty collection) when nothing
            // matches, so clean input must skip the loop instead of throwing.
            HtmlNodeCollection invalidNodes = htmlDoc.DocumentNode.SelectNodes(xPath);
            if (invalidNodes != null)
            {
                foreach (HtmlNode node in invalidNodes)
                {
                    // keepGrandChildren = true: drop the tag, keep its content.
                    node.ParentNode.RemoveChild(node, true);
                }
            }
        }

        return htmlDoc.DocumentNode.WriteContentTo();
    }
}

我之所以重命名节点,是因为如果遇到带XML命名空间的节点,直接用其名称构造XPath会在解析时崩溃。



转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/465390.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号