日期:2014-05-16 浏览次数:21076 次
def scanningHotArticle(url):
    """Fetch ``url`` and print how many "detail" divs each article block holds.

    The page is downloaded with ``requests`` and parsed with lxml's
    BeautifulSoup-backed ``soupparser``. Every
    ``<div class="block untagged mb15 bs2">`` in the document is treated as
    one article, and for each of them the number of ``<div class="detail">``
    elements found inside it is printed on its own line.

    Args:
        url: Address of the page to scan; it is echoed to stdout before the
            request is made.
    """
    print(url)
    response = requests.get(url)
    dom = soupparser.fromstring(response.content)
    # An XPath starting with "//" always searches from the document root, so
    # the query can run on the parsed tree directly. Indexing dom[1] to reach
    # <body> first would raise IndexError whenever the parsed <html> element
    # has fewer than two children (e.g. a document without <head>).
    articles = dom.xpath("//div[@class='block untagged mb15 bs2']")
    for article in articles:
        # ".//" restricts the search to this article's own subtree, so the
        # element does not have to be serialised and parsed again just to
        # scope the query.
        print(len(article.xpath(".//div[@class='detail']")))


# Example interactive session: parsing a small document with lxml.etree and
# serialising it back.
#
# >>> import lxml.etree as etree
# >>> html = '<html><body id="1">abc<div>123</div>def<div>456</div>ghi</body></html>'
# >>> dom = etree.fromstring(html)
# >>> etree.tostring(dom)
# '<html><body id="1">abc<div>123</div>def<div>456</div>ghi</body></html>'
>>> import lxml.html.soupparser as soupparser
>>> dom = soupparser.fromstring(html)
>>> etree.tostring(dom)
'<html><body id="1">abc<div>123</div>def<div>456</div>ghi</body></html>'
>>> len(dom)
1
>>> dom[0].tag
'body'
>>> for child in dom:
...     print child.tag
...
body
>>> body = dom[0]
>>> dom.index(body)
0
>>> body.getparent().tag
'html'
>>> for ele in dom.iter():
...     print ele.tag
...
html
body
div
div
>>> children = list(root)
>>> for child in root:
...     print(child.tag)
>>> root[0] is root[1].getprevious() # lxml.etree only!
True
>>> root[1] is root[0].getnext() # lxml.etree only!
True
>>> body.get('id')
'1'
>>> attrs = body.attrib
>>> attrs.get('id')
'1'
>>> root = etree.Element("root", interesting="totally")
>>> etree.tostring(root)
b'<root interesting="totally"/>'