日期:2014-05-17 浏览次数:20776 次
<dl id="search_773979"> <dt> <a href="read-htm-tid-773979.html" target="_blank" class="tlink">12年的二建考试(机电)有新教材了吗?变化大吗?</a> </dt> <dd> <div class="num">0条回复,1次浏览</div> <div class="text"> 今年的新教材出来了吗?</div> <div class="author"><cite>2012-09-09 12:09 -</cite>作者: <a href="u.php?uid=5454001">ficclaoshen</a> - <a href="thread-htm-fid-667.html">二级建造师—交流专版</a></div> </dd> </dl> <dl id="search_773978"> <dt> <a href="read-htm-tid-773978.html" target="_blank" class="tlink">天天来报道</a> </dt> <dd> <div class="num">1条回复,7次浏览</div> <div class="text">天天来报道,学习交流</div> <div class="author"><cite>2012-09-09 12:06 -</cite>作者: <a href="u.php?uid=9307651">z330843564</a> - <a href="thread-htm-fid-377.html">233新手交流</a></div> </dd> </dl>
// Reads a saved search-result page, splits it on the markup fragments that
// surround the wanted values, and appends five of the resulting pieces
// (one per line) to "jieguo.txt".
// When scraping a live page, assign the downloaded page content to html instead.
string html = File.ReadAllText("html.txt", Encoding.Default);

// Markup fragments used as split delimiters; the text left between them
// becomes the entries of jieguo.
string[] separators =
{
    "id=\"search_",
    "\">",
    "target=\"_blank\" class=\"tlink\">",
    "</a>",
    "<div class=\"num\">",
    "条回复,",
    "次浏览</div>",
    "thread-htm-fid-",
    ".html"
};
string[] jieguo = html.Split(separators, System.StringSplitOptions.RemoveEmptyEntries);

// The indexes below are hard-coded for the layout of the sample page
// (presumably id, title, reply count, view count and forum id of the first
// result -- verify against the actual input). Fail with a descriptive error
// instead of a bare IndexOutOfRangeException when the page does not match.
const int requiredLength = 29; // highest index used (28) + 1
if (jieguo.Length < requiredLength)
{
    throw new InvalidDataException(
        "html.txt does not have the expected layout: got " + jieguo.Length +
        " fragments after splitting, need at least " + requiredLength + ".");
}

string[] fields = { jieguo[1], jieguo[19], jieguo[21], jieguo[22], jieguo[28] };

// Same output as before: the five values separated by newlines, with no
// trailing newline after the last one.
File.AppendAllText("jieguo.txt", string.Join(Environment.NewLine, fields));
------解决方案--------------------
循环取值就可以了
string tempStr = File.ReadAllText(@"C:\Documents and Settings\Administrator\桌面\Test.txt", Encoding.GetEncoding("GB2312"));//读取txt string pattern = @"(?is)<dl[^>]*?id=(['""]?)search_(?<id_num>[^'""]+?)[^>]*?>\s*?<dt>\s*?<a[^>]*?>(?<a_text>[^<>]+?)</a>\s*?</dt>"; pattern += @"\s*?<dd>\s*?<div[^>]*?class=(['""]?)num\2[^>]*?>[^<]*?(?<reply_count>\d+)[^<]*?(?<read_count>\d+)[^<]*?</div>"; pattern += @"[\s\S]*?<div[^>]*?class=(['""]?)author\3[^>]*?>[\s\S]*?-?\s*?<a[^>]*?href=(['""]?)[^'""]*?(?<html_num>\d+)\.html?\4"; foreach (Match m in Regex.Matches(tempStr, pattern)) { //循环输出 string v1 = m.Groups["id_num"].Value;//7 string v2 = m.Groups["a_text"].Value;//12年的二建考试(机电)有新教材了吗?变化大吗? string v3 = m.Groups["reply_count"].Value;//0 string v4 = m.Groups["read_count&qu