日期:2014-05-18 浏览次数:20909 次
public class SimpleHtmlParser
{
    // Zero-width lookahead: each match sits at the start of an opening or closing
    // <table>/<div> tag. Group 1 captures a table tag, group 2 captures a div tag.
    private static readonly Regex BlockTagRegex = new Regex(
        @"(?=(</?table.*?>)|(</?div/?.*?>))",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    /// <summary>
    /// Scans <paramref name="s"/> for table and div tags and builds a tree of elements
    /// from them. Opening tags are pushed on a stack; each closing tag pops the most
    /// recently opened element and records its end position, outer HTML and inner HTML.
    /// </summary>
    /// <param name="s">The string to parse.</param>
    /// <param name="elements">
    /// Receives every element whose closing tag was found, in the order the closing
    /// tags appear. Elements that are never closed are not added to this list.
    /// </param>
    /// <returns>
    /// A new root element; the elements opened at the top level are added to its
    /// <c>Children</c> collection.
    /// </returns>
    public static Element ParseHtml(string s, out List<Element> elements)
    {
        elements = new List<Element>();
        Stack<Element> es = new Stack<Element>();

        var element = new Element();
        // Parent for the next opening tag: the innermost element still open, or the root.
        var lastElement = element;

        foreach (Match match in BlockTagRegex.Matches(s))
        {
            // Decide the tag kind from the group that matched instead of searching the
            // tag text, so attribute values such as class="div" or upper-case tag names
            // cannot cause a misclassification.
            Group tableTag = match.Groups[1];
            bool isTable = tableTag.Success;
            Group tag = isTable ? tableTag : match.Groups[2];
            if (!tag.Success || tag.Length <= 0)
            {
                continue;
            }

            string word = tag.Value;

            // Every captured tag starts with '<', so a closing tag has '/' right after it.
            bool isEnd = word[1] == '/';

            // A self-closing tag (<div/>, <div />, <div class="x"/>) opens nothing, so
            // pushing it would pair a later closing tag with the wrong element.
            if (!isEnd && word[word.Length - 2] == '/')
            {
                continue;
            }

            if (!isEnd)
            {
                // New opening tag.
                Element ee;
                if (isTable)
                {
                    ee = new TableElement();
                }
                else
                {
                    ee = new DivElement();
                }

                ee.StartTagIndex = tag.Index;
                ee.StartTagLength = tag.Length;
                ee.BegTag = word;

                // Attach to the current parent, then make this element the parent of
                // whatever opens next.
                ee.Parent = lastElement;
                lastElement.Children.Add(ee);
                lastElement = ee;

                es.Push(ee);
            }
            else
            {
                // Closing tag with nothing open: skip it rather than let Pop() throw.
                if (es.Count == 0)
                {
                    continue;
                }

                var t = es.Pop();
                t.EndTag = word;
                t.EndIndex = tag.Index;
                t.EndTagLength = tag.Length;
                lastElement = t.Parent;

                // Outer HTML runs from the start of the opening tag through the end of
                // the closing tag; inner HTML is the text between the two tags.
                t.OuterHtml = s.Substring(t.StartTagIndex, (t.EndIndex - t.StartTagIndex) + t.EndTagLength);
                t.InnerHtml = s.Substring(t.StartTagIndex + t.StartTagLength, (t.EndIndex - t.StartTagIndex - t.StartTagLength));
                elements.Add(t);
            }
        }

        return element;
    }

    // Remove unneeded tags from the markup
    public static string ReplaceFontSpan(string s)
    {
        Regex r = new Regex("<head>.*?</head>");
        s = r.Replace(s, "");
        r = new Regex("</?font.*?>");
        s = r.Replace(s, "");
        r = new Regex("</?span.*?>");
        s = r.Replace(s, "");
        r = new 
Regex("</?a