日期:2014-05-18  浏览次数:20526 次

如何获取HTML字段中,所有URL地址(过滤掉其它所有)
如何获取HTML字段中,所有URL地址(过滤掉其它所有)

------解决方案--------------------
用正则表达式
------解决方案--------------------
// Downloads the page whose address is typed into TextBox1 and writes every
// distinct http/https URL found in its source to TextBox2, one per line.
TextBox2.Text = "";
string web_url = this.TextBox1.Text; // e.g. "http://blog.csdn.net/21aspnet/"
string all_code;

HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create(web_url);

// The using blocks release the connection and the reader even when the
// download fails part-way through.
// NOTE: StreamReader decodes as UTF-8 unless the stream carries a byte-order
// mark; pages served in another encoding need an explicit Encoding argument.
using (WebResponse all_codeResponse = all_codeRequest.GetResponse())
using (StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream()))
{
    all_code = the_Reader.ReadToEnd();
}

// Scheme, dotted host name, then an optional path/query part.
// "https?" also accepts secure links, which the http-only pattern skipped.
string p = @"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?";
Regex re = new Regex(p, RegexOptions.IgnoreCase);

// Remember every URL already emitted so each one is listed only once
// (ordinal, case-sensitive comparison, the same as string ==).
System.Collections.Generic.HashSet<string> seen = new System.Collections.Generic.HashSet<string>();

// Build the text once instead of re-assigning TextBox2.Text for every match.
System.Text.StringBuilder output = new System.Text.StringBuilder();

foreach (Match m in re.Matches(all_code))
{
    string name = m.Value;

    // Add returns false when the URL was seen before.
    if (seen.Add(name))
    {
        output.Append(name).Append("\n");
    }
}

TextBox2.Text = output.ToString();



------解决方案--------------------
// Finds every <a>...</a> element in HTML; the named group "url" captures the
// text between the opening and the closing tag.
// NOTE(review): the pattern only matches anchors written without attributes
// and captures the link text, not an href value - confirm this is intended.
MatchCollection matchs = Regex.Matches(HTML, @"<a>(?<url>[^<]*)</a>", RegexOptions.IgnoreCase);
------解决方案--------------------
// Appends the text captured by the named group "url" of every match to str.
// The group key must be exactly "url": a key that names no group yields an
// empty, unsuccessful Group, so nothing would be appended.
// NOTE(review): values are appended without a separator - confirm the caller
// does not need one between entries.
foreach (Match m in matchs)
{
    str += m.Groups["url"].Value;
}
------解决方案--------------------
<a href=[^<]*>