日期:2014-05-16 浏览次数:20963 次
def scanningHotArticle(url):
    """Fetch *url* and print how many "detail" divs each article block holds.

    The page is downloaded with ``requests`` and parsed with lxml's
    ``soupparser``.  Every ``<div class="block untagged mb15 bs2">`` in the
    document is treated as one article; for each of them the number of
    descendant ``<div class="detail">`` elements is printed on its own line.

    Args:
        url: Address of the page to scan.  It is echoed to stdout before
            the request is made.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(url)
    response = requests.get(url)
    dom = soupparser.fromstring(response.content)

    # An XPath beginning with "//" is evaluated from the document root no
    # matter which element it is called on, so the parsed root can be queried
    # directly.  Indexing a child first (dom[1]) added nothing to the query
    # and raised IndexError whenever the root had fewer than two children.
    article_list = dom.xpath("//div[@class='block untagged mb15 bs2']")

    for article in article_list:
        # The leading "." anchors the search at this article, so only its own
        # descendants are counted.  This replaces serialising the element and
        # re-parsing it just to scope an absolute "//" query.
        print(len(article.xpath(".//div[@class='detail']")))
>>> import lxml.etree as etree >>> html = '<html><body id="1">abc<div>123</div>def<div>456</div>ghi</body></html>' >>> dom = etree.fromstring(html) >>> etree.tostring(dom) '<html><body id="1">abc<div>123</div>def<div>456</div>ghi</body></html>'
>>> import lxml.html.soupparser as soupparser >>> dom = soupparser.fromstring(html) >>> etree.tostring(dom) '<html><body id="1">abc<div>123</div>def<div>456</div>ghi</body></html>'
>>> len(dom) 1
>>> dom[0].tag 'body'
>>> for child in dom: ... print child.tag ... body
>>> body = dom[0]
>>> dom.index(body)
0
>>> body.getparent().tag 'html'
>>> for ele in dom.iter(): ... print ele.tag ... html body div div
>>> children = list(root) >>> for child in root: ... print(child.tag)
>>> root[0] is root[1].getprevious() # lxml.etree only!
True
>>> root[1] is root[0].getnext() # lxml.etree only!
True
>>> body.get('id') '1'
>>> attrs = body.attrib >>> attrs.get('id') '1'
>>> root = etree.Element("root", interesting="totally")
>>> etree.tostring(root)
b'<root interesting="totally"/>'
&