日期:2014-05-19  浏览次数:20964 次

WinForm 中如何提取指定网页中类似 <a href="news.asp?id=3"> 的相对超链接?
如标题
CSDN 上的答案大多如下所示,但用这种方法获得的超链接都是以 http:// 开头的。对于网页中形如 <a href="news.asp?id=3">内容</a> 的链接,如何获得其中的 news.asp?id=3,并检测它是否可以正常打开?
using   System;  
using   System.Xml;  
using   System.Text;  
using   System.Net;  
using   System.IO;  
using   System.Collections;  
using   System.Text.RegularExpressions;  

public   class   App  
{  
/// <summary>
/// Program entry point: reads a page address from the console, downloads the
/// page, extracts its hyperlinks and writes them to an XML file.
/// </summary>
public static void Main()
{
    string strCode;
    ArrayList alLinks;

    Console.Write("请输入一个网页地址: ");
    string strURL = Console.ReadLine();

    // ReadLine returns null at end of input; an empty address cannot be fetched either.
    if (strURL == null || strURL.Trim().Length == 0)
    {
        Console.WriteLine("未输入网页地址。");
        return;
    }

    strURL = strURL.Trim();

    // Prepend a scheme only when none was typed. StartsWith is safe for inputs
    // shorter than the prefix, unlike Substring(0, 7), which throws on them.
    if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        && !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        strURL = "http://" + strURL;
    }

    Console.WriteLine("正在获取页面代码,请稍侯... ");
    strCode = GetPageSource(strURL);

    Console.WriteLine("正在提取超链接,请稍侯... ");
    alLinks = GetHyperLinks(strCode);

    Console.WriteLine("正在写入文件,请稍侯... ");
    WriteToXml(strURL, alLinks);
}

/// <summary>
/// Downloads the HTML source of the specified web page.
/// </summary>
/// <param name="URL">Absolute URL of the page to download.</param>
/// <returns>The response body decoded as GB2312 text.</returns>
static string GetPageSource(string URL)
{
    Uri uri = new Uri(URL);

    HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

    // Configure the request BEFORE GetResponse(): settings assigned after the
    // response has been obtained cannot influence the request already sent.
    hwReq.Method = "GET";
    hwReq.KeepAlive = false;

    // Dispose the response, its stream and the reader so the underlying
    // connection is released even when reading fails.
    using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
    using (Stream body = hwRes.GetResponseStream())
    using (StreamReader reader = new StreamReader(body, System.Text.Encoding.GetEncoding("GB2312")))
    {
        // NOTE(review): the page is always decoded as GB2312, regardless of the
        // charset the server declares — confirm this suits the target sites.
        return reader.ReadToEnd();
    }
}

/// <summary>
/// Extracts the distinct absolute http:// URLs that appear in a piece of HTML.
/// </summary>
/// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
/// <returns>
/// A sorted <see cref="ArrayList"/> of unique URL strings. Only text matching the
/// absolute "http://host/path" pattern is returned; relative href values such as
/// "news.asp?id=3" are not captured by this pattern.
/// </returns>
static ArrayList GetHyperLinks(string htmlCode)
{
    ArrayList al = new ArrayList();

    if (htmlCode == null || htmlCode.Length == 0)
    {
        return al;
    }

    // host: one or more dot-separated labels; optional path/query part.
    string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";

    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

    // Hashtable gives constant-time duplicate detection instead of rescanning
    // the result list for every match.
    Hashtable seen = new Hashtable();

    foreach (Match match in r.Matches(htmlCode))
    {
        string strNew = match.Value;

        // Filter out URLs that were already collected.
        if (!seen.ContainsKey(strNew))
        {
            seen.Add(strNew, null);
            al.Add(strNew);
        }
    }

    al.Sort();

    return al;
}

//   把网址写入xml文件  
static   void   WriteToXml(string   strURL,   ArrayList   alHyperLinks)  
{  
XmlTextWriter   writer   =   new   XmlTextWriter( "HyperLinks.xml ",Encoding.UTF8);  

writer.Formatting   =   Formatting.Indented;  
writer.WriteStartDocument(false);  
writer.WriteDocType( "HyperLinks ",   null,   "urls.dtd ",   null);  
writer.WriteComment( "提取自 "   +   strURL   +   "的超链接 ");