日期:2014-05-17 浏览次数:20690 次
package com.web.test;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

/**
 * Downloads an HTML page and uses HTML Parser to list every hyperlink
 * ({@code <a>} tag) found in it, printing each link's inner text followed by
 * its target URL.
 *
 * @author YYmmiinngg
 */
public class ReadHTML2 {

    /** Page whose hyperlinks are listed. */
    private static final String PAGE_URL = "http://www.boc.cn/finadata/lilv/";

    /** Separator printed between a link's text and its target. */
    private static final String LINK_SEPARATOR = " ==>> ";

    /**
     * Fetches {@link #PAGE_URL} and prints one line per hyperlink to standard
     * output. Download and parse failures are reported on standard error.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        try {
            printLinks(fetchHtml(PAGE_URL));
        } catch (IOException | ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the document at the given URL and returns it as a string.
     *
     * @param pageUrl absolute URL of the page to download
     * @return the page content, with each line terminated by {@code '\n'}
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    private static String fetchHtml(String pageUrl) throws IOException {
        StringBuilder html = new StringBuilder();
        // Decode as UTF-8 explicitly so the text matches the charset handed to
        // the parser instead of depending on the platform default.
        // try-with-resources closes the reader and the underlying URL stream.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new URL(pageUrl).openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // readLine() strips the terminator; put one back so markup that
                // is split across lines (e.g. "<a" / "href=...") is not fused.
                html.append(line).append('\n');
            }
        }
        return html.toString();
    }

    /**
     * Parses the given HTML and prints every {@code <a>} tag as
     * {@code <inner text> ==>> <link target>}, one per line.
     *
     * @param html the HTML document text
     * @throws ParserException if HTML Parser fails while extracting the nodes
     */
    private static void printLinks(String html) throws ParserException {
        Parser parser = Parser.createParser(html, "utf-8");

        // Keep only the nodes that HTML Parser recognised as link tags.
        NodeList links = parser.extractAllNodesThatMatch(new NodeFilter() {
            @Override
            public boolean accept(Node node) {
                return node instanceof LinkTag;
            }
        });

        for (int i = 0; i < links.size(); i++) {
            LinkTag link = (LinkTag) links.elementAt(i);
            System.out.println(link.getStringText() + LINK_SEPARATOR + link.extractLink());
        }
    }
}