日期:2014-05-17 浏览次数:20712 次
package function.htmlparser; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.HasParentFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; public class Test { public void listAll(Parser parser){ try { NodeIterator nodeIterator=parser.elements(); while (nodeIterator.hasMoreNodes()){ System.out.println("+++++++++++++++++++++"); Node node=nodeIterator.nextNode(); System.out.println("getText():"+node.getText()); System.out.println("getHtml():"+node.toHtml()); } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public void filter(Parser parser){ NodeList nodelist; // NodeFilter filterL = new TagNameFilter("a"); NodeFilter filterS = new HasAttributeFilter("class","post-title"); NodeFilter filterP= new HasParentFilter(filterS); try { nodelist=parser.parse(filterP); //Node node=nodelist.elementAft(0); // NodeFilter haf= new HasAttributeFilter("class","post-title"); // 获取相应的节点 nodelist=nodelist.extractAllNodesThatMatch(filterP,true); for(int i=0;i<nodelist.size();i++){ LinkTag link=(LinkTag)nodelist.elementAt(i).getFirstChild(); System.out.println(link.getAttribute("href")+"/n"); System.out.println(link.getStringText()); // System.out.println(nodelist.elementAt(i).getFirstChild().getText()+"-----"+nodelist.elementAt(i).getFirstChild().toHtml()); } } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void main(String[] args) { String urlStr="http://localhost:8080/tomfish88/error.jsp"; Parser parser=new Parser(); try { parser.setURL(urlStr); parser.setEncoding("gb2312"); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } Test 
test=new Test(); test.filter(parser); } }?
html文件
?
?
?
<%@ page language="java" contentType="text/html; charset=GB18030" pageEncoding="GB18030"%>
<%-- Sample page: a table in which the rows marked class="post-title" each hold one cell wrapping a single link. --%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=GB18030">
<title>Insert title here</title>
</head>
<body>
error!!!!!!
<table>
<tr><td>td-c1</td></tr>
<tr class="post-title"><td><a href="http://www.fsd.com">连接1</a></td></tr>
<tr><td>td-cc1 <a href="http://www.fsd44444.com">连接3</a> </td></tr>
<tr class="post-title"><td><a href="http://www.fsd222222.com">连接2</a></td></tr>
</table>
</body>
</html>
?
?
?
java文件
?
?
?
?