日期:2014-05-17 浏览次数:20642 次
// Mirrors the paged listing a.aspx?page=N into static files named N.htm
// (1.htm, 2.htm, ...). The number of pages is not known up front: it is
// discovered from the pagination links found in the first page.
int pageCount = 1; // Overwritten with the real page count once page 1 has been fetched.

// Matches pagination links such as "a.aspx?page=7" and captures the page number.
// Built once outside the loop because constructing a Compiled regex is expensive.
// \d+ (rather than \d*) ensures a link without a page number is never rewritten
// to a bare ".htm".
Regex reg = new Regex(@"a\.aspx\?page=(\d+)", RegexOptions.Compiled | RegexOptions.IgnoreCase);

for (int i = 1; i <= pageCount; i++)
{
    string url = "http://www.abc.com/a.aspx?page=" + i;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

    // Dispose the response, stream and reader so the underlying connection is
    // released after every page instead of leaking until finalization.
    string html;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    using (StreamReader reader = new StreamReader(stream))
    {
        html = reader.ReadToEnd();
    }

    // While processing the first page, work out how many pages there are.
    if (i == 1)
    {
        MatchCollection mc = reg.Matches(html);
        if (mc.Count > 0)
        {
            // NOTE(review): assumes the last pagination link on the page points
            // at the final page - confirm against the real markup.
            int parsed;
            if (int.TryParse(mc[mc.Count - 1].Groups[1].Value, out parsed) && parsed > 0)
            {
                // Assign only on success: TryParse zeroes its out argument on
                // failure, which would otherwise clobber pageCount.
                pageCount = parsed;
            }
        }
    }

    // Rewrite links of the form a.aspx?page=N so they point at the static copy N.htm.
    html = reg.Replace(html, "$1.htm");

    // Save the rewritten HTML as a static file named after the page number.
    using (StreamWriter sw = new StreamWriter(HttpContext.Current.Server.MapPath(i + ".htm")))
    {
        sw.Write(html);
    }
}
------解决方案--------------------