日期:2014-05-16 浏览次数:20406 次
选择器 |
用组件的Id |
用组件的class |
package tests;

import java.io.IOException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Example program that fetches the ITeye home page with jsoup and prints the
 * text and {@code href} of every link found inside the first element carrying
 * the CSS class {@code news_content}.
 *
 * <p>It illustrates three ways of locating elements with jsoup:
 * <ol>
 *   <li>by id, e.g. {@code doc.getElementById("news")} (shown only as an alternative);</li>
 *   <li>by class, via {@code getElementsByClass};</li>
 *   <li>by selector, via {@code select}.</li>
 * </ol>
 */
public class EgParseItEyeNews {

    /**
     * Downloads the page, locates the news container and prints one line per link.
     * If no {@code news_content} element exists, the whole document is printed
     * instead so the actual markup can be inspected.
     *
     * @param args unused
     * @throws IOException if fetching the page fails
     */
    public static void main(String[] args) throws IOException {
        String url = "http://www.iteye.com/";

        // Per the original author's note: without a userAgent the request is
        // treated as a crawler by the site.
        Document doc = Jsoup.connect(url)
                .userAgent("Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1 ")
                .get();

        // 1. Lookup by id would be: Element news = doc.getElementById("news");
        // 2. Lookup by class; only the first matching element is used.
        Elements newsContents = doc.getElementsByClass("news_content");
        Element news = newsContents.first();

        if (news == null) {
            // Nothing matched: dump the document instead of the link list.
            System.out.println(doc);
            return;
        }

        // 3. Lookup by selector: collect every <a> below the news container.
        Elements links = news.select("a");
        for (Element link : links) {
            System.out.println(link.text() + " \t链接为:" + link.attr("href"));
        }
    }
}
Elements elems = news.select("dt>a"); // 3. Via a selector: extract the links (here only <a> elements that are direct children of a <dt>)