日期:2014-05-18  浏览次数:20417 次

抓取网页,过滤得到全部URL地址,怎么写?
抓取网页,过滤得到全部URL地址,怎么写?

------解决方案--------------------
Request.Url
------解决方案--------------------
使用javascript的XMLHttpRequest抓取目标网页.
字符串分析 <a> 标签就可以得到大多数url
------解决方案--------------------
正则表达式
------解决方案--------------------
C# code

        /// <summary>
        /// Crawl test: downloads the page at the URL entered in txtTargeturl, passes its
        /// text to getLinks and binds the result to repeaterLinks.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void btnCrawltest_Click(object sender, EventArgs e)
        {
            btnCrawltest.Enabled = false;
            try
            {
                string targetUrl = txtTargeturl.Text;
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(targetUrl);

                // Dispose the response and the reader even when reading or binding throws;
                // an undisposed HttpWebResponse keeps its connection open.
                // NOTE(review): myHelper.getEncode presumably returns the page's charset name - confirm.
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(myHelper.getEncode(targetUrl))))
                {
                    // Bind the extracted link list to the repeater.
                    repeaterLinks.DataSource = getLinks(txtKeyurl.Text, sr.ReadToEnd(), targetUrl);
                    repeaterLinks.DataBind();
                }
            }
            finally
            {
                // Re-enable the button on failure as well, so it is never left disabled.
                btnCrawltest.Enabled = true;
            }

            // Only reached when the crawl succeeded; exceptions propagate to the caller as before.
            Utility.Msg.Show(this, "采集测试完成!");
        }

        /// <summary>
        /// Builds the anchor-tag pattern and asks myHelper.getResult to extract the links
        /// from the given page content.
        /// </summary>
        /// <param name="keyurl">Link marker; forwarded unchanged to myHelper.getResult.</param>
        /// <param name="inStr">Page content (HTML source) to scan.</param>
        /// <param name="targeturl">Target page URL; forwarded unchanged to myHelper.getResult.</param>
        /// <returns>The DataSet returned by myHelper.getResult for the anchor pattern.</returns>
        private DataSet getLinks(string keyurl, string inStr, string targeturl)
        {
            // Captures each anchor's href value in the "url" group and its inner markup in "title".
            // The attribute tail after the href value is [^>]* (zero or more characters): with
            // [^>]+ an unquoted href that ends the tag (<a href=index.html>) could only match by
            // backtracking one character out of "url", yielding "index.htm".
            string strLink = @"(?:<a[\s\S]*?href=['""]?(?<url>[^'""> ]+)['""]?[^>]*>(?<title>[\s\S]*?)</a>)";
            return myHelper.getResult(inStr, strLink, keyurl, targeturl, "", "");
        }

        /// <summary>
        /// Click handler for Button1: assigns a fixed string to lbHtml.Text.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        protected void Button1_Click(object sender, EventArgs e)
        {
            const string fixedText = "1223345455";
            lbHtml.Text = fixedText;
        }