日期:2014-05-16 浏览次数:20425 次
<%@ page import="little.util.Utils" %>
<%@ page import="org.jsoup.nodes.Element" %>
<%@ page import="org.jsoup.Jsoup" %>
<%@ page import="org.jsoup.nodes.Document" %>
<%@ page import="org.jsoup.select.Elements" %>
<%@ page import="java.util.Map" %>
<%@ page import="java.util.TreeMap" %>
<%@ page import="java.util.Iterator" %>
<%@ page contentType="text/html;charset=GBK" language="java" %>
<%
// Purpose: fetch an article index page, collect the links to its individual
// pages, download each page in ascending page-number order, and print the
// concatenated article bodies into the response.
String url = "http://www.yi-see.com/art_10630_3518.html";
// Fetch the index page. Per the original author's note the second argument
// enables proxy access (Utils.readHtml4Get is defined elsewhere - confirm there).
String html = Utils.readHtml4Get(url, true);
Document doc = Jsoup.parse(html);
// Select every anchor whose href contains "read_".
Elements clicks = doc.select("a[href*=read_]");
// TreeMap keeps the entries sorted by page number (ascending).
// NOTE(review): links whose text yields the same number overwrite each other.
Map<Integer,String> map = new TreeMap<Integer,String>();
long startTime = System.currentTimeMillis();
// Map each link's page number (parsed from its text) to its href.
for(Element et : clicks){
map.put(Utils.regexNum(et.text()),et.attr("href"));
}
String str = "";
// StringBuilder: the buffer is local to this request, so no synchronization is needed.
StringBuilder buffer = new StringBuilder();
for(Map.Entry<Integer,String> entry : map.entrySet()){
// Page number, in ascending order.
Integer key = entry.getKey();
// Relative link to the page.
String value = entry.getValue();
System.out.println("key="+key+">>>value="+value);
// Build the absolute URL and fetch the page.
url = "http://www.yi-see.com/"+value;
html = Utils.readHtml4Get(url, true);
doc = Jsoup.parse(html);
// <td class="ART"> holds the article content; only the first match is used.
clicks = doc.select("td[class=ART]");
// Reset for every page so that a page without an ART cell does not
// re-emit the previous page's content.
str = clicks.isEmpty() ? "" : clicks.get(0).toString();
// Strip the site's footer boilerplate from the article body.
str = str.replace(" <div class=\"FL\">\n" +
" 请按 Ctrl+D 将本页加入书签\n" +
" </div><a href=\"feedback.php\" target=\"_blank\">提意见或您需要哪些图书的全集整理?</a><a><br /></a><a href=\"feedback.php?GJB=举报\" target=\"_blank\">举报</a><a><span class=\"mr\">【网站提示】 读者如发现作品内容与法律抵触之处,请向本站举报。 非常感谢您对易读的支持!</span><br /> </a>","");
buffer.append("++++++++++++++++ <br />");
buffer.append(" 第"+key+"页: <br />");
buffer.append("++++++++++++++++ <br />");
buffer.append(str);
}
out.print(buffer.toString());
// Divide by a double so fractional minutes are kept; the previous integer
// divisions truncated the value to whole minutes before the cast.
System.out.println("共用时:" + ((System.currentTimeMillis() - startTime) / 60000.0) + " 分钟");
%>
// U+4E00-U+9FA5 and U+F900-U+FA2D are Unicode ranges covering Chinese characters.
// Accepted shape: exactly one leading Chinese character, a run of digits,
// and at most one trailing Chinese character.
private static final Pattern REGEX_NUM = Pattern.compile("(^[\\u4E00-\\u9FA5\\uF900-\\uFA2D])(\\d+)([\\u4E00-\\u9FA5\\uF900-\\uFA2D])?");

/**
 * Extracts the number embedded in a short label consisting of one Chinese
 * character, digits, and an optional trailing Chinese character.
 *
 * @param str the label to parse; may be {@code null}
 * @return the parsed number, or {@code 0} when {@code str} is {@code null},
 *         does not match the expected shape, or the digits do not fit in an
 *         {@code int}
 */
public static int regexNum(String str){
    // Pattern.matcher(null) would throw; treat null like any other non-matching input.
    if (str == null) {
        return 0;
    }
    Matcher matcher = REGEX_NUM.matcher(str);
    if (!matcher.matches()) {
        return 0;
    }
    try {
        return Integer.parseInt(matcher.group(2));
    } catch (NumberFormatException ignored) {
        // The digit run is too large for an int; fall back to the same
        // value used for unparseable input.
        return 0;
    }
}