使用 Jsoup + CSSPath 采集和讯网人物信息
?
代码见 GitHub。
?
模型类:
?
/**
 * Mutable data holder for one person's profile.
 *
 * <p>All fields start out as {@code null}; each getter returns whatever was last
 * passed to the matching setter (no copying, no validation).
 */
public class Person {

    /** Person's name. */
    private String name;

    /** Basic information (presumably attribute label to value; confirm against the collector). */
    private Map<String, String> basicInfos;

    /** Education history. */
    private List<String> educations;

    /** Work experience. */
    private List<String> jobs;

    /** Important events. */
    private List<String> importants;

    /**
     * @return the person's name, or {@code null} if it has not been set
     */
    public String getName() {
        return name;
    }

    /**
     * @param name the person's name
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the basic-information map, or {@code null} if it has not been set
     */
    public Map<String, String> getBasicInfos() {
        return basicInfos;
    }

    /**
     * @param basicInfos the basic-information map; stored by reference
     */
    public void setBasicInfos(Map<String, String> basicInfos) {
        this.basicInfos = basicInfos;
    }

    /**
     * @return the education history, or {@code null} if it has not been set
     */
    public List<String> getEducations() {
        return educations;
    }

    /**
     * @param educations the education history; stored by reference
     */
    public void setEducations(List<String> educations) {
        this.educations = educations;
    }

    /**
     * @return the work experience, or {@code null} if it has not been set
     */
    public List<String> getJobs() {
        return jobs;
    }

    /**
     * @param jobs the work experience; stored by reference
     */
    public void setJobs(List<String> jobs) {
        this.jobs = jobs;
    }

    /**
     * @return the important events, or {@code null} if they have not been set
     */
    public List<String> getImportants() {
        return importants;
    }

    /**
     * @param importants the important events; stored by reference
     */
    public void setImportants(List<String> importants) {
        this.importants = importants;
    }
}
?
?
?
采集器:
?
package org.apdplat.demo.collect; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class PersonCollector{ private static final Logger LOG = LoggerFactory.getLogger(PersonCollector.class); private static final int PAGES = 298; public List<Person> collect() { List<Person> persons = new ArrayList<>(); try { String url = "http://renwu.hexun.com/search.aspx?z=All&Filter=All&page="; //共298页 for(int i=1; i<PAGES+1; i++){ url += i; Document document = Jsoup.connect(url).get(); String cssQuery = "html body div.wrap div.mainBox div.main div.contBox div.cont div.slistBox ul li a"; LOG.debug("cssQuery: " + cssQuery); Elements elements = document.select(cssQuery); for(Element element : elements){ try{ String personName = element.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("?").text(), "·"); LOG.debug("人物姓名:"+personName); String href = element.attr("href"); LOG.debug("人物链接:"+href); document = Jsoup.connect(href).get(); //基本信息 String basicInfoCSSQuery = "html body div.wrap div.mainBox div.main div.setBase div.right ul li"; LOG.debug("basicInfoCSSQuery: " + basicInfoCSSQuery); Elements basicElements = document.select(basicInfoCSSQuery); Map<String, String> basicInfos = new HashMap<>(); for(Element basicElement : basicElements){ String info = basicElement.text().replace(Jsoup.parse(" ").text(), " ").replace(Jsoup.parse("?").text(), "·"); if(info != null){ String[] attrs = info.split(":");