日期:2014-05-17  浏览次数:20592 次

过滤HTML代码,只留下文本
    using System; 
  using System.Web; 
  using System.Text.RegularExpressions;
 /// <summary>
 /// Reduces an HTML fragment to plain text: drops script blocks, tags and comment
 /// markers, decodes a small set of character entities, removes any remaining
 /// angle brackets and CR/LF pairs, then HTML-encodes and trims the result.
 /// </summary>
 /// <param name="Htmlstring">The HTML to clean. Null or empty input yields an empty string.</param>
 /// <returns>The extracted text, HTML-encoded and trimmed of surrounding whitespace.</returns>
 public static string NoHTML(string Htmlstring)
 {
     if (string.IsNullOrEmpty(Htmlstring))
     {
         return string.Empty;
     }

     const RegexOptions ignoreCase = RegexOptions.IgnoreCase;

     // Script blocks go first, body included. Singleline lets '.' cross line breaks;
     // without it a multi-line script body would survive as visible text.
     Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", ignoreCase | RegexOptions.Singleline);

     // Every remaining tag.
     Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", ignoreCase);

     // A line break together with the whitespace that follows it.
     Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", ignoreCase);

     // Leftover comment delimiters: a stray closer, then an unterminated opener
     // together with the rest of its line.
     Htmlstring = Regex.Replace(Htmlstring, @"-->", "", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", ignoreCase);

     // Named / numeric entities that are decoded to their characters.
     Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\u00A1", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\u00A2", ignoreCase);
     // 163 is the code point of the pound sign; it must be handled before the
     // catch-all below, which would otherwise delete it.
     Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\u00A3", ignoreCase);
     Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\u00A9", ignoreCase);

     // Any other decimal numeric entity is dropped.
     Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", ignoreCase);

     // string.Replace returns a new string, so the result has to be assigned
     // for the brackets and CR/LF pairs to actually disappear.
     Htmlstring = Htmlstring.Replace("<", "")
                            .Replace(">", "")
                            .Replace("\r\n", "");

     // HttpUtility.HtmlEncode does not need an active HttpContext, so the method
     // also works outside of a web request.
     return HttpUtility.HtmlEncode(Htmlstring).Trim();
 }

?

/// 提取HTML代码中文字的C#函数 
  /// <summary> 
  /// 去除HTML标记 
  /// </summary> 
  /// <param name="strHtml">包含HTML的源码</param> 
  /// <returns>已经去除后的文字</returns> 
  using System; 
  using System.Text.RegularExpressions; 
  /// <summary>
  /// Small console demo around <see cref="StripHTML"/>, which reduces an HTML
  /// string to its text content using an ordered list of regex replacements.
  /// </summary>
  public class StripHTMLTest
  {
      // Each row is { pattern, replacement }. Rows are applied top to bottom with
      // RegexOptions.IgnoreCase; keeping both halves in one row means a pattern
      // can never drift out of step with its replacement.
      private static readonly string[][] Rules =
      {
          // (?s) lets '.' cross line breaks so multi-line script bodies are removed too.
          new[] { @"(?s)<script[^>]*?>.*?</script>", "" },
          // Opening/closing tags with optional namespace prefix and attributes.
          // NOTE(review): the nested quantifiers here look prone to heavy
          // backtracking on long unterminated tags - confirm before feeding
          // untrusted input.
          new[] { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
          // A line break together with the whitespace that follows it.
          new[] { @"([\r\n])[\s]+", "" },
          new[] { @"&(quot|#34);", "\"" },
          new[] { @"&(amp|#38);", "&" },
          new[] { @"&(lt|#60);", "<" },
          new[] { @"&(gt|#62);", ">" },
          new[] { @"&(nbsp|#160);", " " },
          new[] { @"&(iexcl|#161);", "\u00A1" },
          new[] { @"&(cent|#162);", "\u00A2" },
          // 163 is the pound sign; it must precede the catch-all numeric rule.
          new[] { @"&(pound|#163);", "\u00A3" },
          new[] { @"&(copy|#169);", "\u00A9" },
          // Any other decimal numeric entity is dropped.
          new[] { @"&#(\d+);", "" },
          // A comment closer becomes a line break so the next rule can consume
          // the whole comment up to that break.
          new[] { @"-->", "\r\n" },
          new[] { @"<!--.*\n", "" }
      };

      /// <summary>
      /// Demo entry point: strips a sample document and prints the result.
      /// </summary>
      public static void Main()
      {
          string s = StripHTML("<HTML><HEAD><TITLE>资料测试</TITLE></HEAD><BODY>信息</BODY></HTML>");
          Console.WriteLine(s);
      }

      /// <summary>
      /// Removes HTML markup from <paramref name="strHtml"/> and decodes a small
      /// set of character entities.
      /// </summary>
      /// <param name="strHtml">Source text containing HTML. Null or empty input yields an empty string.</param>
      /// <returns>The text with markup, angle brackets and CR/LF pairs removed.</returns>
      public static string StripHTML(string strHtml)
      {
          if (string.IsNullOrEmpty(strHtml))
          {
              return string.Empty;
          }

          string strOutput = strHtml;
          foreach (string[] rule in Rules)
          {
              strOutput = Regex.Replace(strOutput, rule[0], rule[1], RegexOptions.IgnoreCase);
          }

          // string.Replace returns a new string; the result must be assigned.
          // This also removes brackets that entity decoding has just produced,
          // so "&lt;b&gt;" cannot come back out as live markup.
          return strOutput.Replace("<", "")
                          .Replace(">", "")
                          .Replace("\r\n", "");
      }
  }
  写一个静态方法 
  #region 移除HTML标签 
  /// <summary>
  /// Removes HTML tags from a string, leaving the text between them untouched.
  /// </summary>
  /// <param name="HTMLStr">Text that may contain HTML tags. Null or empty input yields an empty string.</param>
  /// <returns>The input with every <c>&lt;...&gt;</c> run removed.</returns>
  public static string ParseTags(string HTMLStr)
  {
      if (string.IsNullOrEmpty(HTMLStr))
      {
          return string.Empty;
      }

      // Fully qualified so the method does not depend on a using directive.
      return System.Text.RegularExpressions.Regex.Replace(HTMLStr, "<[^>]*>", "");
  }

?

  #region 取出文本中的图片地址 
  /// <summary> 
  /// 取出文本中的图片地址 
  /// </summary> 
  /// <param name="HTMLStr">HTMLStr</param> 
  public static string GetImgUrl(string HTMLStr) 
  { 
  string