日期:2014-05-18  浏览次数:20923 次

采集ASPX网站翻页的数据(dopostback方式)

采集数据一般有两种方式。第一种是用 HttpWebRequest 的 POST 方法:有人说可以实现,但是我试了很久都没有成功,翻不了页——我找不到办法带入 __VIEWSTATE 参数(太长)。如果哪位实现了,也请给个解决方法。

我现在采用第二种:用 WebBrowser 来采集。这里存在一个问题:__doPostBack 是 AJAX 形式,我不知道它什么时候完成。当然可以用 Timer 来等待,但是我是用线程方式创建 WebBrowser 的,在 DocumentCompleted 以外读取“web1[intThreadBH].Document.Body.InnerHtml”会提示“函数求值超时”。不知道哪位有经验的朋友可以指点指点。代码如下:
C# code

        // Browser controls, indexed by the number stored in each control's Name
        // (see web1_DocumentCompleted). Only slot 1 is used by the visible code.
        WebBrowser[] web1=new WebBrowser[99];
        // Per-slot completion flag: set to true once the slot's HTML has been captured;
        // the worker thread polls it to know when to stop waiting.
        bool[] m_boolWeb = new bool[99];
        // Per-slot captured Document.Body.InnerHtml of the page.
        string[] html = new string[99];
        /// <summary>
        /// Starts the scraping job on a low-priority background STA thread when the form loads.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            Thread oThread = new Thread(new ParameterizedThreadStart(StartAspxTaskSearch));

            oThread.IsBackground = true;
            oThread.Priority = ThreadPriority.Lowest;
            // The Thread.ApartmentState property is obsolete; SetApartmentState is the
            // supported replacement and has to be called before the thread is started.
            oThread.SetApartmentState(ApartmentState.STA);
            oThread.Start(1);
        }
        /// <summary>
        /// Worker-thread entry point: has the UI thread create browser slot 1 and navigate it
        /// to the list page, waits until the completion flag for slot 1 is set, then shows
        /// the captured HTML.
        /// </summary>
        /// <param name="str">Argument handed to <c>Thread.Start</c>; not used here.</param>
        private void StartAspxTaskSearch(object str)
        {
            try
            {
                html[1] = "";
                m_boolWeb[1] = false;

                // Creates web1[1] on the UI thread.
                this.Invoke(new EventHandler(this.invokeTest));

                // The control was created on the UI thread, so it is also wired up and
                // navigated there; touching it directly from this worker thread is a
                // cross-thread access to the control.
                this.Invoke(new MethodInvoker(delegate
                {
                    web1[1].DocumentCompleted += new System.Windows.Forms.WebBrowserDocumentCompletedEventHandler(web1_DocumentCompleted);
                    web1[1].Name = "1";
                    web1[1].Navigate("http://xxxx.com/InfoList.aspx?type=es");
                }));

                // The browser's DOM cannot be read from this thread, so page-turn detection
                // is left to the DocumentCompleted handler; here we only wait for its flag.
                while (!m_boolWeb[1])
                {
                    Thread.Sleep(800);
                }

                MessageBox.Show(html[1]);
            }
            catch (Exception ex)
            {
                // Record the failure instead of silently discarding it.
                System.Diagnostics.Trace.TraceError(ex.ToString());
            }
        }
        /// <summary>
        /// Runs once when the list page has loaded: writes the target page number into the
        /// pager text box, triggers the pager's <c>__doPostBack</c>, and then polls from a
        /// UI-thread timer until the page body changes. The resulting HTML is stored in
        /// <c>html</c> and the slot's <c>m_boolWeb</c> flag is set.
        /// </summary>
        /// <param name="sender">The <see cref="WebBrowser"/> whose Name holds its slot number.</param>
        /// <param name="e">Event data (unused).</param>
        private void web1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
        {
            const int PollIntervalMs = 500;
            const int MaxPolls = 60; // give up after roughly 30 seconds

            WebBrowser browser = (WebBrowser)sender;
            int intThreadBH = Convert.ToInt32(browser.Name);

            // The event can fire before the whole page is ready (e.g. for sub-frames).
            if (browser.ReadyState != WebBrowserReadyState.Complete
                || browser.Document == null
                || browser.Document.Body == null)
            {
                return;
            }

            // A full postback raises DocumentCompleted again; detach so the postback is
            // issued only once instead of being re-triggered on every reload.
            browser.DocumentCompleted -= new WebBrowserDocumentCompletedEventHandler(web1_DocumentCompleted);

            HtmlElement pageBox = browser.Document.All["ctl00$ContentPlaceHolder1$GridView1$ctl38$txtPage"];
            if (pageBox == null)
            {
                // No pager on this page: hand back what was loaded so the waiting
                // worker thread is released instead of waiting forever.
                html[intThreadBH] = browser.Document.Body.InnerHtml;
                m_boolWeb[intThreadBH] = true;
                return;
            }

            pageBox.SetAttribute("value", "8"); // 8 stands in for the target page number

            // Snapshot taken after editing the text box, so only the postback result
            // counts as a change.
            string htmlBefore = browser.Document.Body.InnerHtml;

            browser.Document.InvokeScript("__doPostBack", new object[] { "ctl00$ContentPlaceHolder1$GridView1$ctl38$linkGo", "" });

            // Spinning in a loop here would block the UI message pump and the postback
            // could never complete. A Windows Forms timer polls on the UI thread while
            // leaving the pump free to run.
            int polls = 0;
            System.Windows.Forms.Timer pollTimer = new System.Windows.Forms.Timer();
            pollTimer.Interval = PollIntervalMs;
            pollTimer.Tick += delegate
            {
                polls++;

                string htmlNow = null;
                if (!browser.IsBusy
                    && browser.ReadyState == WebBrowserReadyState.Complete
                    && browser.Document != null
                    && browser.Document.Body != null)
                {
                    htmlNow = browser.Document.Body.InnerHtml;
                }

                bool pageChanged = htmlNow != null && htmlNow != htmlBefore;
                if (!pageChanged && polls < MaxPolls)
                {
                    return;
                }

                pollTimer.Stop();
                pollTimer.Dispose();

                // On timeout this is the last HTML that could be read (possibly unchanged).
                html[intThreadBH] = htmlNow ?? htmlBefore;
                m_boolWeb[intThreadBH] = true;
            };
            pollTimer.Start();
        }
        /// <summary>
        /// Creates the browser for slot 1 and adds it to the form's controls.
        /// Meant to be run through <c>Invoke</c> so it executes on the UI thread.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        public void invokeTest(object sender, EventArgs e)
        {
            WebBrowser browser = new WebBrowser();
            web1[1] = browser;
            Controls.Add(browser);
        }



------解决方案--------------------
// Form-encoded body of the POST; append further "&name=value" pairs as required.
string postData = "";
postData += "page=2";

byte[] data = System.Text.Encoding.UTF8.GetBytes(postData);
// NOTE: the target URL is left empty here and must be filled in before this can run.
HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create("");
myRequest.Method = "POST";
myRequest.ContentType = "application/x-www-form-urlencoded";
myRequest.ContentLength = data.Length;

// Dispose the request stream even if the write fails.
using (System.IO.Stream newStream = myRequest.GetRequestStream())
{
    newStream.Write(data, 0, data.Length);
}

// Dispose the response and reader so the underlying connection is released.
string content;
using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
using (System.IO.StreamReader reader = new System.IO.StreamReader(myResponse.GetResponseStream(), System.Text.Encoding.UTF8))
{
    content = reader.ReadToEnd();
}
textBox1.Text = content;
WebBrowser 方式则通过执行按钮点击操作来实现翻页。
------解决方案--------------------

------解决方案--------------------
通过这种方式分页的您还是放弃采集算了!
------解决方案--------------------