栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 面试经验 > 面试问答

从互联网下载HTML之后,字符串中的字符发生了变化

面试问答 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

从互联网下载HTML之后,字符串中的字符发生了变化

这是一个包装好的下载类,它支持gzip并检查编码标头和meta标签,以便对其正确解码。

实例化该类,然后调用 GetPage() 方法,即可获得按正确编码解码后的 HTML 字符串。

/// <summary>
/// Downloads a web page as a string, transparently handling gzip/deflate
/// compression and picking the text encoding from the HTTP Content-Type
/// header first and from an HTML &lt;meta&gt; charset declaration second.
/// </summary>
/// <remarks>
/// Create an instance and call <see cref="GetPage"/>. After the call,
/// <see cref="Headers"/> holds the response headers and <see cref="Url"/>
/// holds the final response URI (which differs from the requested one after a redirect).
/// </remarks>
public class HttpDownloader
{
    // Extracts the charset parameter from a Content-Type header value,
    // e.g. "text/html; charset=utf-8".
    private static readonly Regex HeaderCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>.*)",
        RegexOptions.IgnoreCase);

    // Extracts the charset from an HTML <meta> tag, covering both
    // <meta charset="..."> and <meta http-equiv=... content="...; charset=...">.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+.*?charset\s*=\s*""?(?<charset>[A-Za-z0-9_-]+)""?",
        RegexOptions.Singleline | RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used for the first decoding pass. Defaults to ISO-8859-1 and is
    /// replaced by the charset announced in the response header, when there is one.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response; set by <see cref="GetPage"/>.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>
    /// The URI to download. <see cref="GetPage"/> overwrites it with the response URI.
    /// </summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for the given address.</summary>
    /// <param name="url">Absolute URL of the page; must be parseable by <see cref="Uri"/>.</param>
    /// <param name="referer">Value for the Referer request header; null or empty to omit it.</param>
    /// <param name="userAgent">Value for the User-Agent request header; null or empty to omit it.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // constructing the Uri also validates the address
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/> and returns the response body decoded to a string.
    /// </summary>
    /// <returns>The decoded page content with leading and trailing whitespace trimmed.</returns>
    /// <exception cref="WebException">The request failed.</exception>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }

        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }

        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    /// <summary>
    /// Reads the whole (optionally compressed) response body and decodes it.
    /// </summary>
    private string ProcessContent(HttpWebResponse response)
    {
        SetEncodingFromHeader(response);

        string contentEncoding = response.ContentEncoding ?? string.Empty;
        Stream body = response.GetResponseStream();

        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = new GZipStream(body, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            body = new DeflateStream(body, CompressionMode.Decompress);
        }

        // Keep the raw bytes: the text may have to be decoded a second time
        // once the <meta> charset is known.
        byte[] content;
        using (body)
        using (MemoryStream buffer = new MemoryStream())
        {
            body.CopyTo(buffer);
            content = buffer.ToArray();
        }

        string html = Decode(content, this.Encoding);
        return CheckMetaCharSetAndReEncode(content, html);
    }

    /// <summary>
    /// Sets <see cref="Encoding"/> from the response's character set, falling back to
    /// parsing the Content-Type header. Leaves it untouched when no usable charset is found.
    /// </summary>
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = response.CharacterSet;

        if (string.IsNullOrEmpty(charset))
        {
            Match m = HeaderCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                charset = m.Groups["charset"].Value.Trim().Trim('\'', '"');
            }
        }

        if (string.IsNullOrEmpty(charset))
        {
            return;
        }

        try
        {
            this.Encoding = Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Unknown charset name sent by the server: keep the current encoding (best effort).
        }
    }

    /// <summary>
    /// Looks for a charset declared in an HTML &lt;meta&gt; tag and, when it differs from
    /// <see cref="Encoding"/>, decodes the raw bytes again with that charset.
    /// </summary>
    /// <param name="content">Raw, already decompressed response bytes.</param>
    /// <param name="html">Text obtained from the first decoding pass.</param>
    /// <returns>The re-decoded text, or <paramref name="html"/> unchanged.</returns>
    private string CheckMetaCharSetAndReEncode(byte[] content, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        string charset = m.Groups["charset"].Value.ToLowerInvariant();

        // The tag was found by scanning text decoded with a single-byte-compatible
        // encoding, so a declared "unicode"/"utf-16" is treated as utf-8.
        if (charset == "unicode" || charset == "utf-16")
        {
            charset = "utf-8";
        }

        try
        {
            Encoding metaEncoding = Encoding.GetEncoding(charset);
            if (!metaEncoding.Equals(this.Encoding))
            {
                html = Decode(content, metaEncoding);
            }
        }
        catch (ArgumentException)
        {
            // Unknown charset name in the page: keep the first decoding (best effort).
        }

        return html;
    }

    /// <summary>
    /// Decodes bytes to a trimmed string. Uses a StreamReader, which also
    /// detects a byte-order mark at the start of the data.
    /// </summary>
    private static string Decode(byte[] content, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(content), encoding))
        {
            return reader.ReadToEnd().Trim();
        }
    }
}


转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/465257.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号