// Date: 2014-05-17 | Views: 21006
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.IO.Compression;
using System.Net;
using System.Windows.Forms;
using System.Web;
namespace SearchEngine
{
class Crawl
{
// Regex used to analyze a web page's encoding: matches a <meta ...> tag containing
// "charset=" and captures the charset name (group 1) up to the closing double quote.
string patternCode = @"<meta[\s\S]+?charset=[\s]*[""]?(.*?)""[\s]*[\S]?>";//used to analyze the website's encoding
// Regex for anchor tags whose first attribute is href: group 1 is the text after
// href= (optionally after an opening quote), ending at the first '"', '|' or '>'.
// NOTE(review): inside the character class '|' is a literal pipe, not alternation — confirm that is intended.
string patternUrl = "<a\\s+href\\s*=\\s*\"?(.*?)[\"|>]";
// Example of meta content carrying a charset: content="text/html; charset=gb2312"
// Directory path under which files are saved (hard-coded absolute path).
string filePath = @"E:\学习\各种项目\智能搜索\htmldownload\";//file save path
// Log file paths in the same directory; presumably for download errors and
// encoding-detection errors respectively — confirm against the code that writes them.
string dlErrorName = @"E:\学习\各种项目\智能搜索\htmldownload\downloadErrorLog.txt";
string getCodeErrorName = @"E:\学习\各种项目\智能搜索\htmldownload\getCodeErrorLog.txt";
/// <summary>
/// Downloads the HTML document at the specified URL and saves it locally.
/// </summary>
/// <param name="url">The URL of the page to download.</param>
/// <returns>The HTML text.</returns>
public string download(string url)
{
string encoding="ASSIC";
string html=string.Empty;
try
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.Timeout = 5000;
request.Method = "get";
request.ContentType = "text/html";
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream resStream = response.GetResponseStream();
StreamReader strRead;
encoding = getEncoding(url);
//判断网页是否经过gzip压缩,如果是则解压缩
if (response.ContentEncoding != null && response.ContentEncoding.Equals("gzip", StringCompari