日期:2014-05-17  浏览次数:20712 次

使用 HtmlParser 抓取网页链接的例子
package function.htmlparser;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasParentFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class Test {

	public void listAll(Parser parser){
		
		try {
			NodeIterator nodeIterator=parser.elements();
			while (nodeIterator.hasMoreNodes()){
				System.out.println("+++++++++++++++++++++");
				Node node=nodeIterator.nextNode();
				System.out.println("getText():"+node.getText());
				System.out.println("getHtml():"+node.toHtml());
				
			}
		} catch (ParserException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		
	}
	
	public void filter(Parser parser){
		
		NodeList nodelist;
		
	//	NodeFilter filterL = new TagNameFilter("a");

		NodeFilter filterS = new HasAttributeFilter("class","post-title");
		
		NodeFilter filterP= new HasParentFilter(filterS);
		

		
		try {
			nodelist=parser.parse(filterP);
			//Node node=nodelist.elementAft(0);
		//	NodeFilter haf= new HasAttributeFilter("class","post-title");

//			获取相应的节点
			nodelist=nodelist.extractAllNodesThatMatch(filterP,true);
			for(int i=0;i<nodelist.size();i++){
				LinkTag link=(LinkTag)nodelist.elementAt(i).getFirstChild();
			System.out.println(link.getAttribute("href")+"/n");
				System.out.println(link.getStringText());
			
			//	System.out.println(nodelist.elementAt(i).getFirstChild().getText()+"-----"+nodelist.elementAt(i).getFirstChild().toHtml());
			}
			
			
			
			
			
			
			
			
		} catch (ParserException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
		
	}

	public static void main(String[] args) {

		String urlStr="http://localhost:8080/tomfish88/error.jsp";
	Parser parser=new Parser();
	try {
		parser.setURL(urlStr);
		parser.setEncoding("gb2312");

	} catch (ParserException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
		
		Test test=new Test();
		test.filter(parser);
		
}
}?

html文件

<%-- Sample page for the link-extraction example; the Java code fetches it as /tomfish88/error.jsp on localhost:8080. --%>
<%@ page language="java" contentType="text/html; charset=GB18030"
    pageEncoding="GB18030"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=GB18030">
<title>Insert title here</title>
</head>
<body>
error!!!!!!

<table>

<%-- Rows with class="post-title" are the ones the example's HasAttributeFilter("class","post-title") looks for. --%>
<tr><td>td-c1</td></tr>
		<tr class="post-title"><td><a href="http://www.fsd.com">连接1</a></td></tr>
		
		
<%-- The link in the next row is not inside a post-title row. --%>
<tr><td>td-cc1 <a href="http://www.fsd44444.com">连接3</a> </td></tr>
		<tr class="post-title"><td><a href="http://www.fsd222222.com">连接2</a></td></tr>
		
		</table>

</body>
</html>

java文件

1 楼 ningwuyu 2011-08-09  
jar 咋没上传啊