日期:2014-05-20  浏览次数:21119 次

Alexa自动采集器出现问题?
前段时间做了一个Alexa流量数据自动采集工具,是用C# WinForm做的,窗体上有一个WebBrowser控件。窗体加载后,先添加需要采集的网站列表,然后访问http://cn.alexa.com采集,在这里采集到的数据是没问题的。接下来去http://alexa.chinaz.com采集IP和PV值的时候,经常会出现访问后页面是空白页的情况,然后程序就会停在那里不再继续。很奇怪,这是什么问题?

代码如下:
C# code

bool flagExit = true;// Exit flag: true = do not exit, false = exit
        List<Alexa> list = new List<Alexa>();// Sites to collect; populated in Form1_Load
        int index = 0;// Current position in list for the Alexa official-site pass
        int indexChinaz = 0;// Current position in list for the Chinaz pass
        bool startRecord = false;// Whether the official-site collection pass is active
        bool startChinaz = false;// Whether the Chinaz collection pass is active
        bool IsRecording = false;// Whether collection has started; per the original note it is true only when the current time is 13:00 (that time check is not in this section)
        StringBuilder sb = new StringBuilder();// Scratch buffer reused by SetAlexaInfo to hold element text
        HtmlDocument document = null;// Document of the page being parsed; assigned in SetIPPv and SetAlexaInfo
        /// <summary>
        /// Form load handler: fills the list of sites to collect, enables the timer
        /// and points the embedded browser at a blank page.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            // Hold for two seconds once the form has loaded.
            System.Threading.Thread.Sleep(2000);

            // Display name / domain pairs, in the order they are collected.
            string[,] sites =
            {
                { "她时代", "smartshe.com" },
                { "elle", "ellechina.com" },
                { "onlylady", "onlylady.com" },
                { "pclady", "pclady.com.cn" },
                { "yoka", "yoka.com" },
                { "贝太厨房", "bettyskitchen.com.cn" },
                { "都市主妇", "herschina.com" },
                { "嘉人", "marieclairechina.com" },
                { "女友", "ny1988.com" },
                { "瑞丽", "rayli.com.cn" },
                { "时尚", "trends.com.cn" },
                { "悦己", "self.com.cn" },
            };
            for (int row = 0; row < sites.GetLength(0); row++)
            {
                list.Add(new Alexa { DomainName = sites[row, 0], DomainUrl = sites[row, 1] });
            }

            timer1.Enabled = true;

            const string blankPage = "about:blank";
            webBrowser1.Navigate(new Uri(blankPage));
            tboCurUrl.Text = blankPage;
        }

        /// <summary>
        /// WebBrowser DocumentCompleted handler. When the page counts as loaded, it
        /// harvests the data for the active pass and moves on to the next site:
        /// the Chinaz pass (IP/PV) is checked first, then the official-site pass (ranks).
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data; its Url is compared with the browser's current Url.</param>
        private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            // The page counts as loaded when the completed URL is the browser's URL,
            // or the browser reports ReadyState Complete.
            bool chinazUrlMatches = string.Equals(e.Url.ToString(), webBrowser1.Url.ToString());
            bool chinazPageReady = chinazUrlMatches || webBrowser1.ReadyState == WebBrowserReadyState.Complete;
            if (chinazPageReady && startChinaz)
            {
                SetIPPv();
                GetAlexaFromChinaz(indexChinaz);
            }

            // Checked again on purpose: the stage above may have started a new navigation.
            bool recordUrlMatches = string.Equals(e.Url.ToString(), webBrowser1.Url.ToString());
            bool recordPageReady = recordUrlMatches || webBrowser1.ReadyState == WebBrowserReadyState.Complete;
            if (recordPageReady && startRecord)
            {
                // Store the rank information, then request the next site.
                SetAlexaInfo();
                start(index);
            }
        }

        /// <summary>
        /// Requests the Alexa official-site traffic page for one site in the list.
        /// Once the position runs past the end of the list, switches from the
        /// official-site pass to the Chinaz pass and starts it at the first site.
        /// </summary>
        /// <param name="num">Zero-based position of the site in the list.</param>
        private void start(int num)
        {
            if (num >= list.Count)
            {
                // Official-site pass finished: hand over to the IP/PV pass.
                startRecord = false;
                startChinaz = true;
                GetAlexaFromChinaz(0);
                return;
            }

            timer1.Enabled = false;
            IsRecording = true;

            // Pause for one second before navigating.
            System.Threading.Thread.Sleep(1000);

            string siteInfoUrl = "http://cn.alexa.com/siteinfo/" + list[num].DomainUrl + "#trafficstats";
            webBrowser1.Navigate(new Uri(siteInfoUrl));
            tboCurUrl.Text = siteInfoUrl;
        }

        /// <summary>
        /// Requests the Chinaz Alexa page for one site so its IP and PV values can be read.
        /// Once the position runs past the end of the list, exports the results to the
        /// database, resets the collection state and navigates to the home page.
        /// </summary>
        /// <param name="num">Zero-based position of the site in the list.</param>
        private void GetAlexaFromChinaz(int num)
        {
            if (num >= list.Count)
            {
                // All sites done: export automatically (an Excel export was the earlier alternative).
                ExportToSql();
                timer1.Enabled = true;
                startRecord = false;
                startChinaz = false;
                index = 0;
                indexChinaz = 0;

                const string homePage = "http://www.smartshe.com";
                webBrowser1.Navigate(new Uri(homePage));
                tboCurUrl.Text = homePage;
                return;
            }

            // Pause for five seconds before each Chinaz request.
            System.Threading.Thread.Sleep(5000);

            string lookupUrl = "http://alexa.chinaz.com/?domain=" + list[num].DomainUrl;
            webBrowser1.Navigate(new Uri(lookupUrl));
            tboCurUrl.Text = lookupUrl;
        }

        /// <summary>
        /// Reads the IP and PV values from the page currently loaded in the browser,
        /// stores them on the site at position indexChinaz, and advances indexChinaz.
        /// A value that cannot be read (blank page, missing element, "insufficient data"
        /// message) is stored as "-" so the collection run keeps going instead of
        /// stopping on a NullReferenceException.
        /// </summary>
        private void SetIPPv()
        {
            document = webBrowser1.Document;

            // Nothing left to fill in; avoids indexing past the end of the list
            // if the handler runs once more than expected.
            if (indexChinaz < 0 || indexChinaz >= list.Count)
            {
                return;
            }

            list[indexChinaz].IpNum = ReadChinazValue("IpNum");
            list[indexChinaz].PvNum = ReadChinazValue("PvNum");
            indexChinaz += 1;
        }

        /// <summary>
        /// Returns the numeric text of the element with the given id in the current
        /// document, or "-" when the document, the element or its text is unavailable.
        /// </summary>
        /// <param name="elementId">Id of the HTML element to read.</param>
        /// <returns>The element text without its first two characters and without commas, or "-".</returns>
        private string ReadChinazValue(string elementId)
        {
            const string missing = "-";

            if (document == null)
            {
                return missing;
            }

            // GetElementById returns null when the page has no such element (e.g. a blank page).
            var element = document.GetElementById(elementId);
            if (element == null)
            {
                return missing;
            }

            string text = element.InnerText;
            if (text == null || text == "相关数据不充分,无法统计。" || text.Length < 2)
            {
                return missing;
            }

            // The first two characters are skipped (presumably a label prefix - confirm against the page).
            return text.Substring(2).Replace(",", "");
        }

        /// <summary>
        /// Reads the ranking figures from the page currently loaded in the browser,
        /// stores them on the site at position index, and advances index.
        /// Each figure that cannot be found (missing element or no regex match) is
        /// stored as "-", so a blank or partial page no longer throws.
        /// </summary>
        private void SetAlexaInfo()
        {
            // Nothing left to fill in; avoids indexing past the end of the list.
            if (index < 0 || index >= list.Count)
            {
                return;
            }

            document = webBrowser1.Document;
            Alexa site = list[index];

            // "rank" element: yesterday / 7-day average / 1-month average ranks.
            string rankText = LoadElementTextIntoBuffer("rank");
            site.TodayRank = MatchRankOrDash(rankText, @"昨日([\d,]+)");
            site.WeekRank = MatchRankOrDash(rankText, @"最近七天平均([\d,]+)");
            site.MonthRank = MatchRankOrDash(rankText, @"最近一月平均([\d,]+)");

            // "siteStats" element: overall rank; line breaks are removed before matching.
            string statsText = LoadElementTextIntoBuffer("siteStats").Replace("\r\n", "");
            site.Rank = MatchRankOrDash(statsText, @"([\d,]+)网站流量排名");

            // "trafficstats" element: rank within China.
            string trafficText = LoadElementTextIntoBuffer("trafficstats");
            site.ChinaRank = MatchRankOrDash(trafficText, @"([\d,]+)   China");

            index += 1;
        }

        /// <summary>
        /// Clears the shared buffer sb, fills it with the text of the given element
        /// (left empty when the document or element is missing) and returns that text.
        /// </summary>
        /// <param name="elementId">Id of the HTML element to read.</param>
        /// <returns>The element text, or an empty string when unavailable.</returns>
        private string LoadElementTextIntoBuffer(string elementId)
        {
            sb.Remove(0, sb.Length);
            if (document != null)
            {
                // GetElementById returns null when the page has no such element.
                var element = document.GetElementById(elementId);
                if (element != null)
                {
                    // Appending a null string leaves the buffer unchanged.
                    sb.Append(element.InnerText);
                }
            }
            return sb.ToString();
        }

        /// <summary>
        /// Applies a case-insensitive regex to the text and returns the first capture
        /// group with commas removed, or "-" when the pattern does not match.
        /// </summary>
        /// <param name="text">Text to search.</param>
        /// <param name="pattern">Regex whose first group captures the number.</param>
        /// <returns>The captured digits without commas, or "-".</returns>
        private static string MatchRankOrDash(string text, string pattern)
        {
            Match m = Regex.Match(text, pattern, RegexOptions.IgnoreCase);
            return m.Success ? m.Groups[1].Value.Replace(",", "") : "-";
        }

        /// <summary>
        /// Writes every collected site record to the database, one
        /// DBHelp.ExecuteNonQuery call per record, in list order.
        /// </summary>
        private void ExportToSql()
        {
            list.ForEach(record => DBHelp.ExecuteNonQuery(record));
        }

        /// <summary>
        /// Navigates the browser to the address typed in the URL box.
        /// An address that already starts with http:// or https:// (any letter case)
        /// is used as-is; anything else gets "http://" prepended. An empty box is ignored.
        /// </summary>
        private void GoUrl()
        {
            string address = tboCurUrl.Text.Trim();

            // "http://" alone is not a valid URI, so there is nothing to navigate to.
            if (address.Length == 0)
            {
                return;
            }

            // Ordinal comparison: URL schemes are not linguistic text.
            bool hasHttpScheme =
                address.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || address.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasHttpScheme)
            {
                address = "http://" + address;
            }

            webBrowser1.Navigate(new Uri(address));
        }

        /// <summary>
        /// Hides the form, sets its state to minimized and shows the notify icon.
        /// </summary>
        private void NormalToMinimized()
        {
            Visible = false;
            WindowState = FormWindowState.Minimized;

            // The notify icon stands in for the hidden window.
            notifyIcon1.Visible = true;
        }

        /// <summary>
        /// Shows the form again at its normal size and hides the notify icon.
        /// </summary>
        private void MinimizedToNormal()
        {
            Visible = true;
            WindowState = FormWindowState.Normal;

            // The window is back, so the notify icon is no longer needed.
            notifyIcon1.Visible = false;
        }