日期:2014-05-20  浏览次数:20901 次

急:C#怎样使用正则表达式去掉字符串中所有的html标记
C#怎样使用正则表达式去掉字符串中所有的html标记
比如:
<html>
<p> Hello   world! </p>
<p> This   is   test   page </p>
<p> 1   testone </p>
<p> 2   testtwo </p>
<p> 3   testthree </p>
</html>
去掉html标记后为
Hello   world!
This   is   test   page
1   testone
2   testtwo
3   testthree

------解决方案--------------------
试下

string yourStr = ............;
// Remove every HTML tag: '<', then the shortest possible run of any characters
// ([\s\S] also matches newlines, so multi-line tags are covered), then '>'.
// The tag is replaced with an empty string so only the text between tags remains.
string resultStr = Regex.Replace(yourStr, @"<[\s\S]*?>", "", RegexOptions.IgnoreCase);

------解决方案--------------------
try..

// Strip every HTML tag in place: lazily match from '<' to the nearest '>'
// ([\s\S] spans newlines too) and replace the tag with nothing.
str = Regex.Replace(str, @"<[\s\S]*?>", "");
Console.WriteLine(str);

------解决方案--------------------
// Matches HTML tags to be removed while leaving some tags in place:
//   - first alternative: an opening or closing tag whose name does NOT start with
//     td, tr, u, table or img (negative lookahead), optionally self-closed with '/';
//   - second alternative: "<!...>" declarations such as <!DOCTYPE ...>.
// Note the lookahead is a prefix test, so e.g. <ul> is also kept because it starts with "u".
Regex rx = new Regex(@"(<\/?(?!td|tr|u|table|img)[^>\/]*)\/?>|<![^>]+>", RegexOptions.IgnoreCase);

可以修改一下!
------解决方案--------------------
public static string ClearHtml(string strHtml)
{
string [] aryReg ={
@ " <font[^> ]*?> ",
@ " <script[^> ]*?> .*? </script> ",
@ " <(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([ " " '])(\\[ " " 'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?> ",
@ "([\r\n])[\s]+ ",
@ "&(quot|#34); ",
@ "&(amp|#38); ",
@ "&(lt|#60); ",
@ "&(gt|#62); ",
@ "&(nbsp|#160); ",
@ "&(iexcl|#161); ",
@ "&(cent|#162); ",
@ "&(pound|#163); ",
@ "&(copy|#169); ",
@ "&#(\d+); ",
@ "--> ",
@ " <!--.*\n "

};
string [] aryRep = {
" ",
" ",
" ",
" ",
"\ " ",
"& ",
" < ",
"> ",
" ",
"\xa1 ",//chr(161),
"\xa2 ",//chr(162),
"\xa3 ",//chr(163),
"\xa9 ",//chr(169),
" ",
"\r\n ",
" "
};
string newReg =aryReg[0];
string strOutput=strHtml;
for(int i = 0;i <aryReg.Length;i++)
{
Regex regex = new Regex(aryReg[i],RegexOptions.IgnoreCase );
strOutput = regex.Replace(strOutput,aryRep[i]);
}
strOutput.Replace( " < ", " ");
strOutput.Replace( "> ", " ");
strOutput.Replace( "\r\n ", " ");
strOutput.Replace( "&nbsp; ", " ");
strOutput.Replace( "&nbsp ", " ");
//strOutput.Replace( "&nb ", " ");
strOutput.Replace( " ", " ");
return "&nbsp;&nbsp; "+strOutput;