日期:2014-05-18  浏览次数:20918 次

发一个解析HTML的代码，目前只能解析table与div。
代码挺简单的，但是被解析的标签一定要配对出现，否则出现错误我不管。至少用来解析baidu搜索结果是没问题的。
有志于写解析器的同学可以拿去玩玩

C# code


    public class SimpleHtmlParser
    {
        /// <summary>
        /// Parses the <c>table</c> and <c>div</c> tags of an HTML string into a tree of elements.
        /// All other markup is ignored and only shows up inside the inner/outer HTML of the
        /// recognised elements.
        /// </summary>
        /// <remarks>
        /// Tags are expected to be properly paired: a closing tag always closes the most recently
        /// opened element, regardless of its tag name. A closing tag that has no open element to
        /// close is ignored. Elements that are never closed stay in the tree but are not added to
        /// <paramref name="elements"/> and get no end-tag or inner/outer HTML information.
        /// Self-closing tags (text ending in <c>/&gt;</c>) produce no element.
        /// </remarks>
        /// <param name="s">The HTML text to parse.</param>
        /// <param name="elements">
        /// Receives every element for which a closing tag was found, in the order in which the
        /// closing tags appear in <paramref name="s"/>.
        /// </param>
        /// <returns>A synthetic root element; the parsed elements are its descendants.</returns>
        public static Element ParseHtml(string s, out List<Element> elements)
        {
            elements = new List<Element>();
            // Elements whose opening tag has been seen but whose closing tag has not.
            Stack<Element> es = new Stack<Element>();
            // Group 1 captures the tag name when it is "table", group 2 when it is "div".
            // \b keeps longer names such as <divider> or <tables> from being picked up, and
            // because the match consumes the whole tag, tag-like text inside an attribute
            // value of a matched tag is not reported a second time.
            string pattern = @"</?(?:(table)|(div))\b.*?>";
            RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
            Regex regex = new Regex(pattern, options);
            var element = new Element();
            var lastElement = element;
            foreach (Match match in regex.Matches(s))
            {
                string word = match.Value;
                int wordindex = match.Index;
                int wordlength = match.Length;

                // Every match starts with '<', so the second character alone decides
                // whether this is a closing tag.
                bool isEnd = word[1] == '/';
                // Decide the element kind from the capture group, not from a substring
                // search, so case and attribute contents cannot confuse it.
                bool isTable = match.Groups[1].Success;

                if (!isEnd)
                {
                    // A self-closing tag such as <div/> or <div class="x" /> never gets a
                    // closing tag; pushing it would mis-pair every element that follows.
                    if (word[wordlength - 2] == '/')
                    {
                        continue;
                    }

                    // Opening tag.
                    Element ee;
                    if (isTable)
                    {
                        ee = new TableElement();
                    }
                    else
                    {
                        ee = new DivElement();
                    }
                    ee.StartTagIndex = wordindex;
                    ee.StartTagLength = wordlength;
                    ee.BegTag = word;
                    // Attach to the element that is currently open (or the root).
                    ee.Parent = lastElement;
                    lastElement = ee;
                    ee.Parent.Children.Add(ee);
                    // Remember it until its closing tag shows up.
                    es.Push(ee);
                }
                else
                {
                    // Closing tag with nothing left to close: ignore it instead of letting
                    // Stack.Pop throw on unbalanced markup.
                    if (es.Count == 0)
                    {
                        continue;
                    }

                    var t = es.Pop();
                    t.EndTag = word;
                    t.EndIndex = wordindex;
                    t.EndTagLength = wordlength;
                    // Subsequent tags belong to the parent again.
                    lastElement = t.Parent;
                    // Outer HTML spans from the start of the opening tag to the end of the
                    // closing tag; inner HTML is the text between the two tags.
                    int innerStart = t.StartTagIndex + t.StartTagLength;
                    t.OuterHtml = s.Substring(t.StartTagIndex, (t.EndIndex - t.StartTagIndex) + t.EndTagLength);
                    t.InnerHtml = s.Substring(innerStart, t.EndIndex - innerStart);
                    elements.Add(t);
                }
            }
            return element;
        }

        //去除代码中无用的标签
        public static string ReplaceFontSpan(string s)
        {
            Regex r = new Regex("<head>.*?</head>");
            s = r.Replace(s, "");
            r = new Regex("</?font.*?>");
            s = r.Replace(s, "");
            r = new Regex("</?span.*?>");
            s = r.Replace(s, "");
            r = new Regex("</?a