日期:2014-05-17  浏览次数:20378 次

如何只读取文字,不读取样式?
现在读取出的效果是:
<meta name="description" content="诚聘多名毛绒玩具缝纫机工,&amp;lt;p&amp;gt;
&nbsp;&amp;lt;span&nbsp;style=&amp;quot;font-family:&nbsp;黑体&amp;quot;&amp;gt;&amp;lt;span&nbsp;style=&amp;quot;font-size:&nbsp;18px&amp;quot;&amp;gt;  本&amp;lt;strong&amp;gt;&amp;lt…玩具批发,毛绒玩具批发,玩具生产,玩具订做厂家,www.yztoy.com">

我想让读取出来的内容是纯文字,而不是包含类似 &amp;lt;p&amp;gt; 这些代码的东西,请问该怎么办?

------解决方案--------------------
C# code

/// <summary>
        /// Removes HTML markup from a string and returns the remaining text.
        /// Script and style elements are removed together with their contents,
        /// every other tag is removed while its inner text is kept, and the
        /// entities <c>&amp;lt;</c>, <c>&amp;gt;</c> and <c>&amp;nbsp;</c> are decoded.
        /// </summary>
        /// <param name="inStr">The HTML fragment to clean. May be null or empty.</param>
        /// <returns>
        /// The text with markup removed and surrounding whitespace trimmed;
        /// an empty string when <paramref name="inStr"/> is null or empty.
        /// </returns>
        public static string RemoveHTML(string inStr)
        {
            if (string.IsNullOrEmpty(inStr))
            {
                return string.Empty;
            }

            // Flags must be combined with '|'. Combining them with '&' yields
            // RegexOptions.None, which silently disables case-insensitive matching.
            const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Compiled;

            // Drop <script> elements including their bodies. The opening tag uses
            // [^>]* (not [^>.]*) so attributes containing a dot, e.g. src="a.js",
            // are still matched. The tag name is kept split across two literals
            // exactly as in the original code.
            string strOutput = Regex.Replace(
                inStr,
                "<scr" + "ipt[^>]*>[\\s\\S]*?</sc" + "ript>",
                string.Empty,
                options);

            // Drop <style> elements including their bodies.
            strOutput = Regex.Replace(
                strOutput,
                "<style[^>]*>[\\s\\S]*?</style>",
                string.Empty,
                options);

            // Drop every remaining tag, including tags that span several lines.
            // [\s\S] matches any character, newlines included.
            strOutput = Regex.Replace(strOutput, "<[\\s\\S]+?>", string.Empty, options);

            // Decode the few entities this helper knows about. This runs after tag
            // stripping, so decoded '<' and '>' characters stay in the result.
            strOutput = strOutput.Replace("&lt;", "<");
            strOutput = strOutput.Replace("&gt;", ">");
            strOutput = strOutput.Replace("&nbsp;", " ");

            return strOutput.Trim();
        }

------解决方案--------------------
C# code

public static string StripHTML(string strHtml)
    {
        string[] aryReg ={
                                  @"<script[^>]*?>.*?</script>",

                                  @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
                                  @"([\r\n])[\s]+",
                                  @"&(quot|#34);",
                                  @"&(amp|#38);",
                                  @"&(lt|#60);",
                                  @"&(gt|#62);", 
                                  @"&(nbsp|#160);", 
                                  @"&(iexcl|#161);",
                                  @"&(cent|#162);",
                                  @"&(pound|#163);",
                                  @"&(copy|#169);",
                                  @"&#(\d+);",
                                  @"-->",
                                  @"<!--.*\n"
                              };

        string[] aryRep = {
                                   "",
                                   "",
                                   "",
                                   "\"",
                                   "&",
                                   "<",
                                   ">",
                                   " "