HtmlAgilityPack编程 - 使用DocumentNode.InnerText去除HTML标签


HtmlAgilityPack编程 - 使用DocumentNode.InnerText去除HTML标签

HtmlAgilityPack编程 - 使用DocumentNode.InnerText去除HTML标签

C# 全选
    /// <summary>
    /// Helpers for reducing HTML markup to plain text (CSFramework.COM tool).
    /// </summary>
    public class SpiderTool
    {
        /// <summary>
        /// Line terminators recognised by <see cref="RemoveEmptyLine"/>. "\r\n" is listed
        /// first so that a CRLF pair is consumed as a single separator.
        /// </summary>
        private static readonly string[] LineSeparators = new string[] { "\r\n", "\n", "\r" };

        /// <summary>
        /// Matches every character entity that <see cref="DecodeEntity"/> handles:
        /// a fixed set of named entities plus any decimal numeric entity.
        /// </summary>
        private const string EntityPattern = @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|ldquo|rdquo|#\d+);";

        /// <summary>
        /// Converts an HTML fragment to plain text: parses it with HtmlAgilityPack,
        /// takes the document's InnerText, drops blank lines and then removes any
        /// remaining markup and entities via <see cref="RemoveHtmlTags"/>.
        /// </summary>
        /// <param name="htmlContent">HTML source; may be null or empty.</param>
        /// <returns>The extracted text, or an empty string for null/empty input.</returns>
        public static string RemoveHTML(string htmlContent)
        {
            if (string.IsNullOrEmpty(htmlContent))
            {
                return string.Empty;
            }

            HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
            doc.LoadHtml(htmlContent);

            string innerText = doc.DocumentNode.InnerText.Trim(); // text content without element tags
            innerText = RemoveEmptyLine(innerText);               // drop blank lines
            innerText = RemoveHtmlTags(innerText);                // strip leftover markup and entities
            return innerText;
        }

        /// <summary>
        /// Strips script blocks, tags, comment remnants and stray angle brackets from
        /// a string and decodes a small set of character entities.
        /// </summary>
        /// <remarks>
        /// A line break followed by whitespace (including the "\n" of a CRLF pair) is
        /// removed together with that whitespace, so CRLF-separated lines are joined.
        /// Numeric entities other than the ones decoded explicitly are dropped.
        /// Literal '&lt;' and '&gt;' characters that are not part of a tag are removed;
        /// entity-encoded ones (&amp;lt; / &amp;gt;) are decoded and kept.
        /// </remarks>
        /// <param name="Htmlstring">Text that may contain HTML markup; may be null or empty.</param>
        /// <returns>The cleaned text, or an empty string for null/empty input.</returns>
        public static string RemoveHtmlTags(string Htmlstring)
        {
            if (string.IsNullOrEmpty(Htmlstring))
            {
                return string.Empty;
            }

            // Remove script blocks. Singleline lets ".*?" span line breaks so that
            // multi-line script bodies are removed too, not just their tags.
            Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

            // Remove tags, then line breaks together with the whitespace that follows them.
            Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

            // Remove comment remnants: dangling terminators and unterminated openers
            // (the latter up to the end of the line).
            Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
            Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

            // string.Replace returns a new string, so the result must be assigned.
            // This runs before entity decoding so that characters produced by decoding
            // &lt; / &gt; are not stripped again.
            Htmlstring = Htmlstring.Replace("<", "");
            Htmlstring = Htmlstring.Replace(">", "");
            Htmlstring = Htmlstring.Replace("\r\n", "");

            // Decode entities in a single pass so that already-decoded output is never
            // re-interpreted (e.g. "&amp;lt;" yields the literal text "&lt;", not "<").
            Htmlstring = Regex.Replace(Htmlstring, EntityPattern, new MatchEvaluator(DecodeEntity), RegexOptions.IgnoreCase);

            return Htmlstring;
        }

        /// <summary>
        /// Removes blank and whitespace-only lines and trims every remaining line.
        /// </summary>
        /// <param name="content">Text to process; lines may end in CRLF, LF or CR.</param>
        /// <returns>
        /// The remaining lines, each followed by <see cref="Environment.NewLine"/>,
        /// or an empty string for null/empty input.
        /// </returns>
        public static string RemoveEmptyLine(string content)
        {
            if (string.IsNullOrEmpty(content))
            {
                return string.Empty;
            }

            string[] lines = content.Split(LineSeparators, StringSplitOptions.RemoveEmptyEntries);
            StringBuilder sb = new StringBuilder(content.Length);

            foreach (string line in lines)
            {
                string trimmed = line.Trim();
                if (trimmed.Length == 0)
                {
                    continue; // whitespace-only line
                }

                sb.AppendLine(trimmed);
            }

            return sb.ToString();
        }

        /// <summary>
        /// Maps one entity matched by <see cref="EntityPattern"/> to its replacement text.
        /// </summary>
        /// <param name="match">A match whose first group is the entity name or "#" plus decimal digits.</param>
        /// <returns>The decoded character, or an empty string for any other numeric entity.</returns>
        private static string DecodeEntity(Match match)
        {
            switch (match.Groups[1].Value.ToLowerInvariant())
            {
                case "quot":
                case "#34":
                case "ldquo": // curly quotes are mapped to a straight double quote
                case "rdquo":
                    return "\"";
                case "amp":
                case "#38":
                    return "&";
                case "lt":
                case "#60":
                    return "<";
                case "gt":
                case "#62":
                    return ">";
                case "nbsp":
                case "#160":
                    return " ";
                case "iexcl":
                case "#161":
                    return "\u00A1";
                case "cent":
                case "#162":
                    return "\u00A2";
                case "pound":
                case "#163":
                    return "\u00A3";
                case "copy":
                case "#169":
                    return "\u00A9";
                default:
                    // Only other numeric entities reach this point; they are dropped.
                    return string.Empty;
            }
        }
    }

CSCODE.NET - C/S架构Winform开发框架文库

版权声明:本文为开发框架文库发布内容,转载请附上原文出处链接
C/S框架网
上一篇:HtmlAgilityPack编程 - HtmlDocument删除html的样式
下一篇:HtmlAgilityPack编程 - 替换样式(style)的值
评论列表

发表评论

评论内容
昵称:
关联文章

HtmlAgilityPack编程 - 使用DocumentNode.InnerText去除HTML标签
使用HtmlAgilityPack.HtmlDocument彻底清除HTML标签
HtmlAgilityPack编程 - HtmlDocument删除html的样式
HtmlAgilityPack编程 - 替换样式(style)的值
Winform Html Editor 使用kindeditor组件实现winform Html 编辑器
OOP:面向对象编程
C#串口通信编程
C#使用正则表达式移除所有的Html标记,返回纯文本
C#异步编程(多线程)
el-tabs / el-tab-pane Tabs标签
ASP.NET Web Forms - HTML 页面
修改html模板
没有自定义表单技术编程量大吗?
OOP面向对象编程(1)图说OOP基础
C# OOP编程 模拟做早餐探索同步异步机制
C# CodeHighlighter生成的高亮着色HTML源码格式化
静态网页生成器 - HTML Generator
标签:C#.Net组件开发 - 设计时持久化对象数组
标签:C#.Net组件开发 - 属性窗体内显示自定义名称
robots.txt 指定 Sitemap.xml的位置和robots Meta标签[转]