日期:2014-05-17  浏览次数:20673 次

Python抓取页面中超链接(URL)的3种方法比较(HTMLParser、pyquery、正则表达式)
HTMLParser版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

import HTMLParser

class UrlParser(HTMLParser.HTMLParser):
    """Collect the ``href`` attribute of every ``<a>`` start tag that is fed in.

    Typical use: create an instance, call ``feed(html)`` one or more times,
    then read the accumulated links with ``geturls()``.
    """

    def __init__(self):
        # Explicit base-class initialiser call, as in the original listing.
        HTMLParser.HTMLParser.__init__(self)
        # href values in the order they were encountered.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        """Record the ``href`` value(s) of an ``<a>`` tag; ignore other tags.

        ``attrs`` is iterated as ``(name, value)`` pairs.
        """
        # The tag/attribute names must be compared without surrounding
        # spaces: ' a ' and ' href ' can never match and no URL would
        # ever be collected.
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href':
                self.urls.append(value)

    def geturls(self):
        """Return the list of collected href values (the internal list)."""
        return self.urls

# Demo: feed a small HTML fragment to UrlParser and print the links found.
# The guard must compare against '__main__' exactly; the space-padded
# ' __main__ ' never matches, so the demo would never run.
if __name__ == '__main__':
    urls = []
    parser = UrlParser()
    parser.feed(' 1111111111<a href="http://www.bccn.net">BCCN</a>2222222<a href="http://bbs.bccn.net">BCCN.BBS</a>333333333 ')
    # Extend a separate list so the demo does not alias the parser's own list.
    urls += parser.geturls()
    # Single-argument print(...) prints the same under the Python 2 print
    # statement and the Python 3 print function.
    print(urls)

pyquery版:
#!/usr/bin/python
# -*- coding: UTF-8 -*-

from pyquery import PyQuery as pq

class UrlParser():
????def __init__ (self):
????????self.urls = []
????def feed(self,data):
????????d = pq(data)
????????if d.find(' a ' ):
????????????# 关于下面一行,我用d('a').attr('href')只能得到第一个URL,暂时只会用map,不知道有没有别的够pythonic的代码
????????????url = d(' a ' ).map(lambda i, e: pq(e)(' a ' ).attr(' href ' ))
????????????for u in url:
????????????????self.urls.append(u)
????def geturls(self):
????????retu