日期:2014-05-17  浏览次数:20675 次

htmlparser获取<a></a>链接地址和标题
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class OrFilterForIamgeOrLinkOrFrame {
//只能获取链接和链接标题,时间无法获取
public static void main(String[] args) throws ParserException {
getImage("http://www.ahtba.org.cn/infomation/");
}

public static void getImage(String url) throws ParserException {
Parser parser = new Parser(url);
parser.setEncoding("gbk");
PrototypicalNodeFactory pnfPrototypicalNodeFactory = new PrototypicalNodeFactory();
pnfPrototypicalNodeFactory.registerTag(new Div());
parser.setNodeFactory(pnfPrototypicalNodeFactory);

NodeFilter filter1 = new NodeClassFilter(LinkTag.class);
NodeList nodelist = parser.extractAllNodesThatMatch(filter1);
for (Node node : nodelist.toNodeArray()) { 
if (node instanceof LinkTag) {
LinkTag link = (LinkTag) node;
if (link != null) {
System.out.println("地址:" + link.getLink()+"\t标题:"+link.getLinkText());
}
}
}
}
}

//***************************************//
//同样可以获得标题、地址,但地址不够完整
    public void getHERF(String html) {  
 
        // 创建Parser对象根据传给字符串和指定的编码  
       Parser parser = Parser.createParser(html, "GBK");  
       // 创建HtmlPage对象HtmlPage(Parser parser)  
        HtmlPage page = new HtmlPage(parser);  
        try {  
            // HtmlPage extends visitor,Apply the given visitor to the current  
            // page.  
            parser.visitAllNodesWith(page);  
        } catch (ParserException e1) {  
           e1 = null;  
        }  
        // 所有的节点  
       NodeList nodelist = page.getBody();  
        // 建立一个节点filter用于过滤节点  
        NodeFilter filter = new TagNameFilter("A");  
        // 得到所有过滤后,想要的节点  
        nodelist = nodelist.extractAllNodesThatMatch(filter, true);  
       for (int i = 0; i < nodelist.size(); i++) { 
       System.out.println("\n");
            LinkTag link = (LinkTag) nodelist.elementAt(i);  
            // 链接地址  
            System.out.println(link.getAttribute("href"));  
            // 链接名称  
            System.out.println(l