日期:2014-05-16  浏览次数:20708 次

使用JSoup+CSSPath采集和讯网人物信息

使用JSoup+CSSPath采集和讯网人物信息

代码见 GitHub。

模型类:

/**
 * Mutable model class describing one person: a name, a map of basic
 * information, and three lists of free-text entries (education history,
 * work experience, important events).
 *
 * <p>All fields are {@code null} until the corresponding setter is called.
 * Getters return the stored reference as-is and setters store the supplied
 * reference as-is; no defensive copies are made.
 */
public class Person {
    /** The person's name. */
    private String name;
    /** Basic information (presumably attribute label to value; confirm against the caller). */
    private Map<String, String> basicInfos;
    /** Education history entries. */
    private List<String> educations;
    /** Work experience entries. */
    private List<String> jobs;
    /** Important event entries. */
    private List<String> importants;

    /**
     * @return the person's name, or {@code null} if it has not been set
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the person's name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the basic-information map, or {@code null} if it has not been set
     */
    public Map<String, String> getBasicInfos() {
        return basicInfos;
    }

    /**
     * @param basicInfos the basic-information map to store (kept by reference)
     */
    public void setBasicInfos(Map<String, String> basicInfos) {
        this.basicInfos = basicInfos;
    }

    /**
     * @return the education history entries, or {@code null} if not set
     */
    public List<String> getEducations() {
        return educations;
    }

    /**
     * @param educations the education history entries to store (kept by reference)
     */
    public void setEducations(List<String> educations) {
        this.educations = educations;
    }

    /**
     * @return the work experience entries, or {@code null} if not set
     */
    public List<String> getJobs() {
        return jobs;
    }

    /**
     * @param jobs the work experience entries to store (kept by reference)
     */
    public void setJobs(List<String> jobs) {
        this.jobs = jobs;
    }

    /**
     * @return the important event entries, or {@code null} if not set
     */
    public List<String> getImportants() {
        return importants;
    }

    /**
     * @param importants the important event entries to store (kept by reference)
     */
    public void setImportants(List<String> importants) {
        this.importants = importants;
    }
}

采集器:

package org.apdplat.demo.collect;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class PersonCollector{
    private static final Logger LOG = LoggerFactory.getLogger(PersonCollector.class);
    private static final int PAGES = 298;

    public List<Person> collect() {
        List<Person> persons = new ArrayList<>();
        try {
            String url = "http://renwu.hexun.com/search.aspx?z=All&Filter=All&page=";
            //共298页
            for(int i=1; i<PAGES+1; i++){
                url += i;
                Document document = Jsoup.connect(url).get();
                String cssQuery = "html body div.wrap div.mainBox div.main div.contBox div.cont div.slistBox ul li a";
                LOG.debug("cssQuery: " + cssQuery);
                Elements elements = document.select(cssQuery);
                for(Element element : elements){
                    try{
                        String personName = element.text().replace(Jsoup.parse("&nbsp;").text(), " ").replace(Jsoup.parse("?").text(), "·");
                        LOG.debug("人物姓名:"+personName);
                        String href = element.attr("href");
                        LOG.debug("人物链接:"+href);
                        document = Jsoup.connect(href).get();
                        //基本信息
                        String basicInfoCSSQuery = "html body div.wrap div.mainBox div.main div.setBase div.right ul li";
                        LOG.debug("basicInfoCSSQuery: " + basicInfoCSSQuery);
                        Elements basicElements = document.select(basicInfoCSSQuery);
                        Map<String, String> basicInfos = new HashMap<>();
                        for(Element basicElement : basicElements){
                            String info = basicElement.text().replace(Jsoup.parse("&nbsp;").text(), " ").replace(Jsoup.parse("?").text(), "·");
                            if(info != null){
                                String[] attrs = info.split(":");