WinForm 中如何提取指定网页中类似 <a href="news.asp?id=3"> 的超链接
如标题
CSDN 上都是如下的答案,但它能获得的超链接都是以 http:// 开头的。而网页中 <a href="news.asp?id=3">内容</a> 里的相对地址 news.asp?id=3 应如何获得?又如何检测它是否可以正常打开?
using System;
using System.Xml;
using System.Text;
using System.Net;
using System.IO;
using System.Collections;
using System.Text.RegularExpressions;
public class App
{
public static void Main()
{
string strCode;
ArrayList alLinks;
Console.Write( "请输入一个网页地址: ");
string strURL = Console.ReadLine();
if(strURL.Substring(0,7) != @ "http:// ")
{
strURL = @ "http:// " + strURL;
}
Console.WriteLine( "正在获取页面代码,请稍侯... ");
strCode = GetPageSource(strURL);
Console.WriteLine( "正在提取超链接,请稍侯... ");
alLinks = GetHyperLinks(strCode);
Console.WriteLine( "正在写入文件,请稍侯... ");
WriteToXml(strURL,alLinks);
}
// 获取指定网页的HTML代码
/// <summary>
/// Downloads the resource at <paramref name="URL"/> with an HTTP GET and
/// returns the whole response body decoded as GB2312 text.
/// </summary>
/// <param name="URL">Absolute address of the page to fetch.</param>
/// <returns>The response body as a string.</returns>
static string GetPageSource(string URL)
{
    Uri uri = new Uri(URL);
    HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

    // Configure the request BEFORE calling GetResponse(); settings applied
    // afterwards cannot influence the request that has already been sent.
    hwReq.Method = "GET";
    hwReq.KeepAlive = false;

    // Dispose the response and reader so the underlying connection is released
    // even if reading fails part-way through.
    using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
    using (StreamReader reader = new StreamReader(
        hwRes.GetResponseStream(),
        System.Text.Encoding.GetEncoding("GB2312")))
    {
        return reader.ReadToEnd();
    }
}
// 提取HTML代码中的网址
/// <summary>
/// Extracts every distinct absolute <c>http://</c> URL that appears in
/// <paramref name="htmlCode"/> and returns them sorted.
/// </summary>
/// <remarks>
/// The pattern only matches text that starts with <c>http://</c>, so relative
/// links such as <c>news.asp?id=3</c> are not returned by this method.
/// Duplicates are detected with an exact (case-sensitive) comparison.
/// </remarks>
/// <param name="htmlCode">HTML text to scan; null or empty yields an empty list.</param>
/// <returns>An <see cref="ArrayList"/> of unique URL strings in sorted order.</returns>
static ArrayList GetHyperLinks(string htmlCode)
{
    ArrayList al = new ArrayList();
    if (htmlCode == null || htmlCode.Length == 0)
    {
        return al;
    }

    // No trailing space in the pattern: a URL must not be required to be
    // followed by a blank in order to match.
    string strRegex = @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);

    // Hashtable used as a set so duplicate filtering is a constant-time lookup
    // rather than a rescan of the result list for every match.
    Hashtable seen = new Hashtable();
    foreach (Match match in r.Matches(htmlCode))
    {
        string url = match.Value;
        if (!seen.ContainsKey(url))
        {
            seen.Add(url, null);
            al.Add(url);
        }
    }

    al.Sort();
    return al;
}
// 把网址写入xml文件
static void WriteToXml(string strURL, ArrayList alHyperLinks)
{
XmlTextWriter writer = new XmlTextWriter( "HyperLinks.xml ",Encoding.UTF8);
writer.Formatting = Formatting.Indented;
writer.WriteStartDocument(false);
writer.WriteDocType( "HyperLinks ", null, "urls.dtd ", null);
writer.WriteComment( "提取自 " + strURL + "的超链接 ");