日期:2014-05-17 浏览次数:20622 次
用 BeautifulSoup 解析含有汉字的网页时,要注意编码和解码问题。下面的示例获取大街网某个职位页面中 class='jobInfo' 的 div 标签的数据内容:
# Python 2 script (it relies on urllib2): download one dajie.com job posting
# and print the children of its <div class="jobInfo"> element as GB2312 bytes.
# print is written in its single-argument parenthesised form, which behaves
# the same as the Python 2 print statement.
from contextlib import closing
import urllib2

from bs4 import BeautifulSoup

JOB_URL = 'http://job.dajie.com/7262fae6-a1aa-4674-9efa-3baf697faa46.html'

# Encoded children that are only layout (line-break tags, bare newlines) and
# are therefore not printed.
LAYOUT_ONLY = ('<br/>', '\n')

# closing() guarantees the HTTP response is closed once the body is read.
with closing(urllib2.urlopen(JOB_URL)) as response:
    soup = BeautifulSoup(response.read())

# Filtering with class_ also finds a div that carries "jobInfo" together with
# other CSS classes; comparing the class list to ['jobInfo'] would miss it.
for div in soup.find_all('div', class_='jobInfo'):
    print('find it')
    # Debug aid: print(div.contents) shows the raw child list.
    for child in div.contents:
        # Note: the output is encoded as GB2312 here, not UTF-8.
        # Presumably this matches the encoding of the target console -- confirm.
        encoded = child.encode('GB2312')
        if encoded not in LAYOUT_ONLY:
            print(encoded)
    # NOTE(review): the original snippet was collapsed onto one line, so the
    # indentation of this break is an assumption: stop after the first
    # matching div. Confirm against the original post.
    break