日期:2014-05-17 浏览次数:20932 次
public class ParseWork { private Parser parser; public ParseWork(String htmlAddress) throws ParserException { parser = new Parser(htmlAddress); } /** * 获取网页标题和正文组成的文本 * **/ protected String getText(String elementId) throws ParserException{ NodeFilter TitleFilter = new NodeClassFilter(TitleTag.class); NodeFilter ElementIdFilter = new HasAttributeFilter("id", elementId); OrFilter orFilter = new OrFilter(TitleFilter, ElementIdFilter); //做一个逻辑OR Filter组合 NodeList list = parser.extractAllNodesThatMatch(orFilter); StringBuffer text = new StringBuffer(); for (int i = 0; i < list.size(); i++) text = text.append(list.elementAt(i).toPlainTextString() + "\r\n"); return text.toString().trim(); } public static void main(String[] args) throws ParserException, IOException { ParseWork p = new ParseWork("E://JavaEye新闻.htm"); String mainText = p.getText("news_content"); //写网页正文文件 FileUtils.writeStringToFile(new File("E://javaeye新闻.txt"), mainText, "utf-8"); //摘要 } }