日期:2014-05-16  浏览次数:20365 次

使用jsoup解析一篇文章示例
jar包下载地址:http://jsoup.org/download
<%@ page import="little.util.Utils" %>
<%@ page import="org.jsoup.nodes.Element" %>
<%@ page import="org.jsoup.Jsoup" %>
<%@ page import="org.jsoup.nodes.Document" %>
<%@ page import="org.jsoup.select.Elements" %>
<%@ page import="java.util.Map" %>
<%@ page import="java.util.TreeMap" %>
<%@ page import="java.util.Iterator" %>
<%@ page contentType="text/html;charset=GBK" language="java" %>

<%
    // Demo: collect every page of a multi-page article with jsoup and print the pages in order.
    String url = "http://www.yi-see.com/art_10630_3518.html";
    // The original note says this call goes through a proxy; presumably that is what the
    // second argument enables -- Utils.readHtml4Get is not shown here, so confirm there.
    String html = Utils.readHtml4Get(url, true);
    Document doc = Jsoup.parse(html);
    // Select every <a> whose href contains "read_" (the per-page links).
    Elements pageLinks = doc.select("a[href*=read_]");
    // TreeMap keeps the entries sorted by page number (ascending).
    Map<Integer, String> pageHrefs = new TreeMap<Integer, String>();
    long startTime = System.currentTimeMillis();
    // Key: page number extracted from the link text; value: the link's href.
    // Links that yield the same number overwrite each other (the last one wins).
    for (Element link : pageLinks) {
        pageHrefs.put(Utils.regexNum(link.text()), link.attr("href"));
    }

    // Site footer that trails the article cell; built once because it never changes.
    final String footer = " <div class=\"FL\">\n" +
            "   请按 Ctrl+D 将本页加入书签\n" +
            " </div><a href=\"feedback.php\" target=\"_blank\">提意见或您需要哪些图书的全集整理?</a><a><br /></a><a href=\"feedback.php?GJB=举报\" target=\"_blank\">举报</a><a><span class=\"mr\">【网站提示】 读者如发现作品内容与法律抵触之处,请向本站举报。 非常感谢您对易读的支持!</span><br />&nbsp;</a>";

    StringBuilder buffer = new StringBuilder();
    for (Map.Entry<Integer, String> entry : pageHrefs.entrySet()) {
        // Page number, visited in ascending order.
        Integer key = entry.getKey();
        // Link href of that page.
        String value = entry.getValue();
        System.out.println("key="+key+">>>value="+value);
        // Build the absolute link and fetch the page.
        url = "http://www.yi-see.com/"+value;
        html = Utils.readHtml4Get(url, true);
        doc = Jsoup.parse(html);
        // <td class="ART"> holds the article content; only the first match is used.
        Elements articleCells = doc.select("td[class=ART]");
        // Start from an empty string for every page so that a page without an article
        // cell does not repeat the previous page's text.
        String str = articleCells.isEmpty() ? "" : articleCells.get(0).toString();
        // Strip the site footer from the end of the article.
        str = str.replace(footer, "");
        buffer.append("++++++++++++++++ <br />");
        buffer.append("    第"+key+"页:   <br />");
        buffer.append("++++++++++++++++ <br />");
        buffer.append(str);
    }
    out.print(buffer.toString());
    // Divide by a floating-point constant so that fractions of a minute are kept;
    // integer division here would report only whole minutes.
    double elapsedMinutes = (System.currentTimeMillis() - startTime) / 60000.0;
    System.out.println("共用时:" + elapsedMinutes + " 分钟");
%>



Utils方法:
    // U+4E00-U+9FA5 and U+F900-U+FA2D are the Unicode ranges used here for Chinese characters.
    // Shape matched: one leading Chinese character, one or more digits, and an optional
    // trailing Chinese character.
    private static final Pattern REGEX_NUM = Pattern.compile("(^[\\u4E00-\\u9FA5\\uF900-\\uFA2D])(\\d+)([\\u4E00-\\u9FA5\\uF900-\\uFA2D])?");

    /**
     * Extracts the number from a label of the form
     * "Chinese character + digits + optional Chinese character".
     *
     * <p>The whole string must have that shape; surrounding text of any other kind
     * makes the match fail.
     *
     * @param str the label text; may be {@code null}
     * @return the parsed number, or {@code 0} when {@code str} is {@code null}, does not
     *         have the expected shape, or the digits do not fit in an {@code int}
     */
    public static int regexNum(String str){
        if (str == null) {
            return 0;
        }
        Matcher matcher = REGEX_NUM.matcher(str);
        if (!matcher.matches()) {
            return 0;
        }
        try {
            return Integer.parseInt(matcher.group(2));
        } catch (NumberFormatException tooLarge) {
            // The digit run exceeds Integer.MAX_VALUE; treat it like any other
            // unusable label instead of propagating the exception to the caller.
            return 0;
        }
    }