htmlparser获取<a></a>链接地址和标题
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Small HTMLParser demo that lists every link tag found on a page,
 * printing each link's address together with its link text.
 */
public class OrFilterForIamgeOrLinkOrFrame {

    // Note: only the link address and the link title can be obtained this way;
    // the time cannot be obtained.

    /**
     * Runs the demo against a fixed URL.
     *
     * @param args command-line arguments (not used)
     * @throws ParserException propagated from {@link #getImage(String)}
     */
    public static void main(String[] args) throws ParserException {
        getImage("http://www.ahtba.org.cn/infomation/");
    }

    /**
     * Parses the resource named by {@code url} and prints one line per link tag
     * to standard output, containing the link address and the link text.
     *
     * <p>Despite its name, this method collects {@link LinkTag} nodes, not images.
     *
     * @param url the resource handed to the {@link Parser} constructor
     * @throws ParserException if the parser cannot be created, configured, or run
     */
    public static void getImage(String url) throws ParserException {
        Parser pageParser = new Parser(url);
        // The encoding is forced to "gbk" rather than detected.
        pageParser.setEncoding("gbk");

        // Use a prototypical node factory with the Div tag registered on it.
        PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
        nodeFactory.registerTag(new Div());
        pageParser.setNodeFactory(nodeFactory);

        // Keep only the nodes whose class is LinkTag.
        NodeList links = pageParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));

        for (int i = 0; i < links.size(); i++) {
            Node candidate = links.elementAt(i);
            // instanceof is false for null, so this guard also covers a null entry.
            if (!(candidate instanceof LinkTag)) {
                continue;
            }
            LinkTag anchor = (LinkTag) candidate;
            System.out.println("地址:" + anchor.getLink() + "\t标题:" + anchor.getLinkText());
        }
    }
}
//***************************************//
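// Alternative approach: this also yields the link title and address, but the address is incomplete because it prints the raw href attribute as written in the HTML (which may be a relative URL).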
public void getHERF(String html) {
// 创建Parser对象根据传给字符串和指定的编码
Parser parser = Parser.createParser(html, "GBK");
// 创建HtmlPage对象HtmlPage(Parser parser)
HtmlPage page = new HtmlPage(parser);
try {
// HtmlPage extends visitor,Apply the given visitor to the current
// page.
parser.visitAllNodesWith(page);
} catch (ParserException e1) {
e1 = null;
}
// 所有的节点
NodeList nodelist = page.getBody();
// 建立一个节点filter用于过滤节点
NodeFilter filter = new TagNameFilter("A");
// 得到所有过滤后,想要的节点
nodelist = nodelist.extractAllNodesThatMatch(filter, true);
for (int i = 0; i < nodelist.size(); i++) {
System.out.println("\n");
LinkTag link = (LinkTag) nodelist.elementAt(i);
// 链接地址
System.out.println(link.getAttribute("href"));
// 链接名称
System.out.println(l