日期:2014-05-17  浏览次数:20748 次

使用 HtmlParser 去除 HTML 标签（获取 body、title 的纯文本）

package test;

import java.io.*;

import org.htmlparser.Parser;
import org.htmlparser.visitors.HtmlPage;

class Test {
public static void main(String[] argv) throws IOException, InterruptedException {
?? Parser parser;
????? String body = "";
????? String title = "";
?????
?? try {
??? parser = new Parser("http://www.hao123.com");
????? parser.setEncoding("UTF-8");
????? HtmlPage htmlpage = new HtmlPage(parser);
????? parser.visitAllNodesWith(htmlpage);
????? //通过htmlparser 获取body内容
????? body = htmlpage.getBody().asString();
????? //通过htmlparser 获取title内容
????? title =htmlpage.getTitle();
????? body = body.replaceAll("[ \\t\\n\\r\\f( |gt) ]+"," ");
????? System.out.println(title);
????? System.out.println(body);
?? } catch (Exception e) {
??? // TODO: handle exception
??? e.printStackTrace();
?? }
}
}

// Fetch the raw HTML source of a page

?? Parser parser = new Parser(address);
?? parser.setEncoding("gbk");
?? System.out.println(parser.parse(null).toHtml());