日期:2014-05-17  浏览次数:20856 次

利用Python脚本抓取AC的代码[爬虫+HTMLParser+handle_entityref+正则表达式+模拟登录+文件操作]
#-*- coding=utf-8 -*-
import time,urllib2,urllib,re,HTMLParser,os
from htmlentitydefs import entitydefs


class PageParser(HTMLParser.HTMLParser):
    """Collect the text found inside ``<textarea>`` elements.

    Text is accumulated in ``self.data`` only while ``self.readcode`` is
    truthy, i.e. between a ``<textarea>`` start tag and its end tag.
    Entity references (``&lt;``) and character references (``&#39;``)
    reported by the parser are translated back to text and routed through
    the same accumulator, so they are ignored outside a textarea as well.
    """

    def __init__(self):
        self.data = ""      # text gathered from textarea elements so far
        self.readcode = 0   # 1 while the parser is inside a <textarea>
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Start collecting when a ``<textarea>`` opens."""
        if tag == 'textarea':
            self.readcode = 1

    def handle_data(self, data):
        """Append *data* to the buffer, but only inside a textarea."""
        if self.readcode:
            self.data += data

    def handle_endtag(self, tag):
        """Stop collecting when the ``</textarea>`` tag is reached."""
        if tag == 'textarea':
            self.readcode = 0

    def handle_entityref(self, name):
        """Translate a named entity such as ``&lt;``.

        Names missing from ``entitydefs`` are kept verbatim as ``&name;``.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Translate a numeric character reference such as ``&#39;``.

        *name* is the text between ``&#`` and ``;`` (decimal, or hex with a
        leading ``x``/``X``).  ASCII code points are decoded; anything else,
        including malformed numbers, is kept verbatim as ``&#name;`` so the
        buffer is never mixed with characters of an unknown encoding.
        """
        try:
            if name[:1] in ('x', 'X'):
                code = int(name[1:], 16)
            else:
                code = int(name)
        except ValueError:
            code = -1
        if 0 < code < 128:
            self.handle_data(chr(code))
        else:
            self.handle_data('&#' + name + ';')

    def getdata(self):
        """Return all textarea text collected so far."""
        return self.data

# NOTE(review): a `global` statement at module level has no effect, and `res`
# is not assigned anywhere in the visible part of the file -- confirm where it
# is defined and used before relying on or removing this line.
global res
# Matches the hidden ASP.NET __VIEWSTATE form field and captures its value.
_VIEWSTATE_RE = re.compile(
    r'<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="([^"]+)" />')


def _fetch_page(url, postdata=None):
    """Return the raw body of *url*, always closing the HTTP response."""
    response = urllib2.urlopen(url, postdata)
    try:
        return response.read()
    finally:
        response.close()


def _extract_viewstate(html):
    """Return the __VIEWSTATE value found in *html*, or None if absent."""
    match = _VIEWSTATE_RE.search(html)
    return match.group(1) if match else None


def getACUrl(user_id='120320050'):
    """Walk every page of a user's status list and hand each to findurl().

    The status page is fetched once to obtain the initial __VIEWSTATE, then
    the grid is paged by posting 'Page$First' followed by repeated
    'Page$Next' postbacks.  Each response is decoded as UTF-8 and passed to
    findurl().  Paging stops when a page no longer contains the "next page"
    link text, when a page carries no __VIEWSTATE field, or when any
    exception is raised inside the loop (the exception is printed).

    user_id -- value posted in the UserIdTextBox form field; the default is
               the ID that used to be hard-coded here.

    Returns None.  An error raised by the very first GET is not caught.
    """
    url = "http://algorithm.fzu.edu.cn/OnlineJudgeUserStatus.aspx"
    next_page_marker = unicode("下一页", "utf8")

    viewstate = _extract_viewstate(_fetch_page(url))
    if viewstate is None:
        return
    parms = {
        '__EVENTTARGET': 'ctl00$MainRightHolder$UserStatusGridView',
        'ctl00$MainRightHolder$UserIdTextBox': user_id,
        '__VIEWSTATE': viewstate,
    }

    # The first postback asks for the first page, every later one for "next".
    page_argument = 'Page$First'
    while True:
        try:
            parms['__EVENTARGUMENT'] = page_argument
            page_argument = 'Page$Next'
            data = unicode(_fetch_page(url, urllib.urlencode(parms)), "utf8")
            findurl(data)
            if next_page_marker not in data:
                break
            # Each response carries a fresh __VIEWSTATE that must be echoed
            # back in the next postback.
            viewstate = _extract_viewstate(data)
            if viewstate is None:
                return
            parms['__VIEWSTATE'] = viewstate
        except Exception as e:
            # Best effort: report the problem and stop paging.
            print(e)
            break
def Login(username, password):
    """Simulate a browser login by posting the form to Default.aspx.

    A cookie-handling opener is built and installed as the urllib2 default
    opener, so later urllib2.urlopen() calls in this process go through the
    same cookie processor.  The login form is then posted with the given
    credentials.

    The response body is read and discarded; the function does not check
    whether the site accepted the credentials.  Any exception is printed
    and swallowed.  Returns None.

    username -- value for the Banner$UserNameText form field
    password -- value for the Banner$Password form field
    """
    try:
        cookies = urllib2.HTTPCookieProcessor()
        opener = urllib2.build_opener(cookies)
        urllib2.install_opener(opener)
        # NOTE(review): this __VIEWSTATE is a fixed snapshot rather than one
        # read from the live page -- confirm the server still accepts it.
        parms = {
            '__VIEWSTATE':r'/wEPDwULLTE2ODk5MTAyOTUPZBYCAgMPZBYCAgUPEA8WAh4LXyFEYXRhQm91bmRnZBAVBgzmnIDmlrDkv6Hmga8kMTLmnIg15pel566X5rOV6K++5YGc5LiK5LiA5qyh77yM6K++EuS8mOengOS9nOS4muWAmemAiSgg5YWz5LqO6aKY55uu55qE5pe26Ze056m66Ze06ZmQ5Yi255qE6ZeuJOWFs+S6jueZu+mZhuezu+e7n+eUqOaIt+WQjeS4juWvhueggQg+PuabtOWkmhUGABdTaG93QnVsbGV0aW4uYXNweD9iaWQ9NRdTaG93QnVsbGV0aW4uYXNweD9iaWQ9NBdTaG93QnVsbGV0aW4uYXNweD9iaWQ9MxdTaG93QnVsbGV0aW4uYXNweD9iaWQ9MhFCdWxsZXRpbkxpc3QuYXNweBQrAwZnZ2dnZ2cWAGQYAgUeX19Db250cm9sc1JlcXVpcmVQb3N0QmFja0tleV9fFgEFEkJhbm5lciRMb2dpbkJ1dHRvbgUgQmFubmVyJFVzZXJDb250cm9sUGFuZWxNdWx0aVZpZXcPD2RmZOAvQzwaH/EzyqdrNO7IO2UefuMIHdnWhg02m4yXus4K',
            'Banner$LoginButton.x':'17',
            'Banner$LoginButton.y':'5'
        }
        parms[r"Banner$UserNameText"] = username
        parms[r"Banner$Password"] = password

        loginUrl = "http://algorithm.fzu.edu.cn/Default.aspx"
        login = urllib2.urlopen(loginUrl, urllib.urlencode(parms))
        try:
            # Drain the body; the content itself is not used.
            login.read()
        finally:
            login.close()
    except Exception as e:
        print(e)
def findurl(data):
    r=re.compile("<a class=\"underline\" href=\".{1,500}?\" target=\"_blank\">.{1,500}?</a></td><td><a class=\"hover-underline\"