日期:2014-05-18  浏览次数:21172 次

C# 高效获取网页源码 再提取网页正文内容
各位大侠,谁能帮忙给个高效获取网页源码、再提取网页正文内容的代码?谢谢,急用。

------解决方案--------------------
C# code
// Fetches m_uri and returns the response body as text, or null when the
// response is not a text/* document.
// NOTE(review): response, stream, encoding and reader are assigned here but
// declared outside this snippet; presumably the enclosing method also cleans
// them up on the success path — confirm.
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(m_uri);
response = request.GetResponse();
if (!request.HaveResponse)
{
    response.Close();
    return null;
}
stream = response.GetResponseStream();

// Only text/* content is decoded. Close the response before bailing out so
// this early return releases the connection just like the one above does.
if (!response.ContentType.StartsWith("text/", System.StringComparison.OrdinalIgnoreCase))
{
    response.Close();
    return null;
}

// Normalise with the invariant culture: a culture-sensitive ToLower() maps
// 'I' to a dotless 'ı' under Turkish cultures, so "UNICODE" would not match.
string strEncoding = _DataCode.ToLowerInvariant();
if (strEncoding == "utf-8")
{
    encoding = Encoding.UTF8;
}
else if (strEncoding == "utf-7")
{
    encoding = Encoding.UTF7;
}
else if (strEncoding == "unicode")
{
    encoding = Encoding.Unicode;
}
else
{
    // Any other name falls back to the system default encoding.
    encoding = Encoding.Default;
}

reader = new StreamReader(stream, encoding);
return reader.ReadToEnd();

------解决方案--------------------
C# code
// Downloads the raw bytes of the page and decodes them as UTF-8.
// The using block disposes the WebClient even if DownloadData throws.
string strContent;
using (System.Net.WebClient webClient = new System.Net.WebClient())
{
    byte[] bText = webClient.DownloadData("http://www.sina.com.cn/");

    // Page content
    strContent = System.Text.Encoding.UTF8.GetString(bText);
}

------解决方案--------------------
/// <summary>
/// Downloads the page at <paramref name="Url"/> and returns its body decoded as GB2312.
/// </summary>
/// <param name="Url">Address of the page to request.</param>
/// <returns>
/// The full page source, or an empty string when any step fails
/// (in which case a message box is shown).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Timeout is expressed in milliseconds (30 seconds here).
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, its stream and the reader deterministically;
        // an unclosed response keeps its connection checked out of the pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch
    {
        // Best-effort by design: report the failure and fall through to
        // return whatever strResult holds (the empty string).
        MessageBox.Show("出错");
    }
    return strResult;
}

给你找来了。自己传个 URL 就行了,返回的是整个页面的源代码。至于详细的信息,靠你自己来截取。

可以用正则,也可以用 IndexOf / Substring。最好写成 Timer 定时的 Windows 服务,用多线程去抓取和存储,程序自己跑,不用操心。