日期:2014-05-16  浏览次数:20297 次

使用Jsoup抽取数据
Jsoup是一个Java的HTML解析器,提供了非常方便的抽取和操作HTML文档的方法,可以结合DOM、CSS和类似jQuery的方法来定位和得到节点的信息。
它有着和jQuery一样强大的select和pipeline的API。
我们以从58同城网抽取租房信息为例,来说明如何使用它:
package test

import org.jsoup.nodes.Document
import java.util.HashMap
import org.jsoup.Jsoup
/**
 * Author: fuliang
 * http://fuliang.iteye.com
 */
/**
 * One rental listing as extracted from a result row.
 *
 * All fields are mutable (`var`) and can be reassigned after construction.
 *
 * @param title     listing title text
 * @param link      URL of the listing
 * @param price     price as a boxed integer
 * @param houseType house type text
 * @param date      date text
 */
class HouseEntry(var title: String, var link: String, var price: Integer, var houseType: String, var date: String) {

	/**
	 * Renders the entry as a single tab-separated line terminated by a newline.
	 *
	 * @return the formatted line, e.g. "title: ...\tlink:...\tprice:...\thouseType:...\tdate:...\n"
	 */
	override def toString(): String =
		"title: %s\tlink:%s\tprice:%d\thouseType:%s\tdate:%s\n".format(title, link, price, houseType, date)
}

/**
 * Fetches a rental search-result page with jsoup and turns its result rows
 * into [[HouseEntry]] values.
 */
class HouseRentCrawler{

	/**
	 * Runs a search request and extracts the listings from the returned page.
	 *
	 * @param url       address of the search page
	 * @param keyword   search keyword, sent as the "key" request parameter
	 * @param lowRange  lower bound of the price range
	 * @param highRange upper bound of the price range
	 * @return the listings in the order they appear in the page; rows that
	 *         cannot be parsed are skipped
	 */
	def crawl(url: String,keyword: String,lowRange: Int,highRange: Int): List[HouseEntry] =
		extract(fetch(url,keyword,lowRange,highRange))

	/**
	 * Issues a GET request to `url` with the search parameters attached and
	 * returns the parsed document.
	 *
	 * NOTE(review): the meaning of the fixed "final", "jump" and "searchtype"
	 * values is defined by the target site and is not visible here - confirm
	 * against the site before changing them.
	 */
	private def fetch(url:String,keyword: String,lowRange: Int,highRange: Int): Document = {
		val params = new HashMap[String,String]()
		params.put("final","1")
		params.put("jump","2")
		params.put("searchtype","3")
		params.put("key",keyword)
		// Price range is sent as "<low>_<high>" under the "MinPrice" key.
		params.put("MinPrice",lowRange.toString + "_" + highRange)

		Jsoup.connect(url)
			.data(params)
			.userAgent("Mozilla")
			.timeout(10000)
			.get()
	}

	/**
	 * Collects one [[HouseEntry]] per matching row of the "#infolist" table,
	 * excluding rows with class "dev".
	 *
	 * Entries are returned in document order. Rows that do not have the
	 * expected shape are skipped instead of aborting the whole extraction.
	 */
	private def extract(doc: Document):  List[HouseEntry] = {
		val rows = doc.select("#infolist > tr:not(.dev)")
		// Index-based traversal keeps this free of Java/Scala collection converters.
		(0 until rows.size()).toList.flatMap(i => parseRow(rows.get(i)))
	}

	/**
	 * Parses a single result row.
	 *
	 * Expected cell layout: 0 = title (containing the "a.t" link), 1 = price,
	 * 2 = house type, 3 = date.
	 *
	 * @return the parsed entry, or None when the row has fewer than four cells
	 *         or its price cell is not a plain integer
	 */
	private def parseRow(row: org.jsoup.nodes.Element): Option[HouseEntry] = {
		val cells = row.select("td")
		if (cells.size() < 4) {
			None
		} else {
			try {
				val price = cells.get(1).text().trim.toInt
				val title = cells.get(0).text()
				val link = cells.get(0).select("a[class=t]").attr("href")
				val houseType = cells.get(2).text()
				val date = cells.get(3).text()
				Some(new HouseEntry(title,link,price,houseType,date))
			} catch {
				// A non-numeric price cell means this row is not a normal listing.
				case _: NumberFormatException => None
			}
		}
	}
}

/** Command-line entry point that runs one sample search and prints the results. */
object HouseRentCrawler{

	/**
	 * Crawls the hard-coded search page for the sample keyword and price range
	 * (2000 to 3500) and prints every extracted entry to standard output.
	 *
	 * @param args command-line arguments (unused)
	 */
	def main(args: Array[String]): Unit = {
		val url = "http://bj.58.com/zufang"
		val crawler = new HouseRentCrawler()
		val houseEntries = crawler.crawl(url,"智学苑",2000,3500)
		houseEntries.foreach(entry => println(entry))
	}
}

Selector overview

    * tagname: find elements by tag, e.g. a
    * ns|tag: find elements by tag in a namespace, e.g. fb|name finds <fb:name> elements
    * #id: find elements by ID, e.g. #logo
    * .class: find elements by class name, e.g. .masthead
    * [attribute]: elements with attribute, e.g. [href]
    * [^attr]: elements with an attribute name prefix, e.g. [^data-] finds elements with HTML5 dataset attributes
    * [attr=value]: elements with attribute value, e.g. [width=500]
    * [attr^=value], [attr$=value], [attr*=value]: elements with attributes that start with, end with, or contain the value, e.g. [href*=/path/]
    * [attr~=regex]: elements with attribute values that match the regular expression; e.g. img[src~=(?i)\.(png|jpe?g)]
    * *: all elements, e.g. *

Selector combinations

    * el#id: elements with ID, e.g. div#logo
    * el.class: elements with class, e.g. div.masthead
    * el[attr]: elements with attribute, e.g. a[href]
    * Any combination, e.g. a[href].highlight
    * ancestor child: child elements that descend from ancestor, e.g. .body p finds p elements anywhere under a block with class "body"
    * parent > child: child elements that descend directly from parent, e.g. div.content > p finds p elements; and body > * finds the direct children of the body tag
    * siblingA + siblingB: finds sibling B element immediately preceded by sibling A, e.g. div.head + div
    * siblingA ~ siblingX: finds sibling