日期:2014-05-20  浏览次数:20716 次

求救 java解析html
求救,java用哪个类可以解析html里面的标签啊,例如我要解析html里面的<a href=""></a>,用哪个类啊?

------解决方案--------------------
jdom
------解决方案--------------------
HTMLParser组件
或者正则
------解决方案--------------------
正则表达式
------解决方案--------------------
比如我想获得http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm网址中社区这个标签的联结地址,可以这样
Java code
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
/**
 * Downloads one forum page and prints the address of the link whose text is "社区".
 *
 * <p>The page is searched with a regular expression, so only anchors written exactly as
 * {@code <a href="...">} (href as the first attribute, double-quoted) are recognised.
 */
public class UrlCodeRegex
{
  /** Address of the page whose source is searched. */
  private static final String PAGE_URL =
      "http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm";

  /**
   * Matches an anchor whose text is "社区"; group 1 is the href value.
   * The character classes stop at the closing quote / closing bracket so the
   * match can never run into a neighbouring attribute or tag.
   */
  private static final Pattern COMMUNITY_LINK =
      Pattern.compile("<a href=\"([^\"]*)\"[^>]*>社区</a>");

  /**
   * Fetches the page and prints the link address, or a not-found message.
   *
   * @param args ignored
   */
  public static void main(String[] args)
  {
    try
    {
      String link = findCommunityLink(download(PAGE_URL));
      if (link == null)
      {
        // Previously this case ended in an IllegalStateException from Matcher.group.
        System.out.println("未找到社区的网址");
      }
      else
      {
        System.out.println("这个社区的网址是" + link);
      }
    }
    catch (MalformedURLException mfURLe) {
      System.out.println("MalformedURLException: " + mfURLe);
    }
    catch (IOException ioe) {
      System.out.println("IOException: " + ioe);
    }
  }

  /**
   * Returns the href of the first anchor in {@code html} whose text is "社区".
   *
   * @param html page source; may be {@code null}
   * @return the href value, or {@code null} when no such anchor is present
   */
  static String findCommunityLink(String html)
  {
    if (html == null)
    {
      return null;
    }
    Matcher m = COMMUNITY_LINK.matcher(html);
    // find() scans for the anchor; group(1) is only valid after a successful match.
    return m.find() ? m.group(1) : null;
  }

  /**
   * Reads the whole document at {@code address} into one string, dropping line breaks.
   *
   * @param address URL to fetch
   * @return the concatenated lines of the response body
   * @throws IOException if the URL is malformed or the transfer fails
   */
  private static String download(String address) throws IOException
  {
    URLConnection con = new URL(address).openConnection();
    // NOTE(review): decodes with the platform default charset, as before;
    // confirm the page encoding if the link text fails to match.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream())))
    {
      StringBuilder sb = new StringBuilder();
      String line;
      while ((line = in.readLine()) != null)
      {
        sb.append(line);
      }
      return sb.toString();
    }
  }
}

------解决方案--------------------
JDK自带的javax.swing.text.html.parser这个包
------解决方案--------------------
或者这个更好
Java code
 /**
 * 此程序是获得网页源代码中某个关键字的链接网址,
 * 如<a href="http://www.sina.com" target="blank">新浪</a>
 */
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
/**
 * Downloads one forum page and prints the absolute http address of the link
 * whose text is "社区".
 *
 * <p>The anchor may carry further attributes after href, for example
 * {@code <a href="http://host/path" target="blank">}.
 */
public class UrlCodeRegex
{
  /** Address of the page whose source is searched. */
  private static final String PAGE_URL =
      "http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm";

  /**
   * Matches an anchor with an http href and the text "社区"; group 1 is the URL.
   * {@code [^>]*} keeps the attribute part inside the same tag, so the href of an
   * earlier anchor is never paired with a later "社区" text.
   */
  private static final Pattern COMMUNITY_LINK = Pattern.compile(
      "<a href=\"(http://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?)\"[^>]*>社区</a>");

  /**
   * Fetches the page and prints the link address, or a not-found message.
   *
   * @param args ignored
   */
  public static void main(String[] args)
  {
    try
    {
      String link = findCommunityLink(download(PAGE_URL));
      if (link == null)
      {
        // Previously this case ended in an IllegalStateException from Matcher.group.
        System.out.println("未找到社区的网址");
      }
      else
      {
        System.out.println("这个社区的网址是" + link);
      }
    }
    catch (MalformedURLException mfURLe) {
      System.out.println("MalformedURLException: " + mfURLe);
    }
    catch (IOException ioe) {
      System.out.println("IOException: " + ioe);
    }
  }

  /**
   * Returns the http URL of the first anchor in {@code html} whose text is "社区".
   *
   * @param html page source; may be {@code null}
   * @return the URL, or {@code null} when no such anchor is present
   */
  static String findCommunityLink(String html)
  {
    if (html == null)
    {
      return null;
    }
    Matcher m = COMMUNITY_LINK.matcher(html);
    // find() scans for the anchor; group(1) is only valid after a successful match.
    return m.find() ? m.group(1) : null;
  }

  /**
   * Reads the whole document at {@code address} into one string, dropping line breaks.
   *
   * @param address URL to fetch
   * @return the concatenated lines of the response body
   * @throws IOException if the URL is malformed or the transfer fails
   */
  private static String download(String address) throws IOException
  {
    URLConnection con = new URL(address).openConnection();
    // NOTE(review): decodes with the platform default charset, as before;
    // confirm the page encoding if the link text fails to match.
    try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream())))
    {
      StringBuilder sb = new StringBuilder();
      String line;
      while ((line = in.readLine()) != null)
      {
        sb.append(line);
      }
      return sb.toString();
    }
  }
}

------解决方案--------------------
用 javax.xml.parsers.SAXParser
自己写个 handler 就好了! 像这样

Java code


    /**
     * Fetches a document over a URL connection, runs it through a SAX parser with a
     * {@code MyHandler}, and prints the handler's {@code list}.
     *
     * <p>SAX expects well-formed XML; a page that is not well-formed makes
     * {@code parse} throw.
     *
     * @param args ignored
     * @throws Exception on a bad URL, an I/O failure, or a parser/parse error
     */
    public static void main(String[] args)throws Exception{

        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        SAXParser parser = factory.newSAXParser();
        // NOTE(review): the input is remote content; consider restricting DTD /
        // external-entity processing on the factory before parsing untrusted pages.
        URL url = new URL("xxxx");//xxxx is a placeholder: put your URL here
        URLConnection con = url.openConnection();
        MyHandler myhandler = new MyHandler();
        // Close the response stream even when parsing fails part-way through.
        try (java.io.InputStream in = con.getInputStream()) {
            parser.parse(in, myhandler);
        }
        System.out.println(myhandler.list);
    }

/**
 * SAX handler that collects the {@code href} attribute of every {@code a}
 * start tag it is notified about.
 */
class MyHandler extends DefaultHandler{
    /** Collected href values, in the order the start tags were reported. */
    public ArrayList<String> list = new ArrayList<String>();

    /**
     * Records the href of an {@code a} element.
     *
     * @param uri        namespace URI of the element (not used for matching)
     * @param localName  local name; empty when the parser is not namespace-aware
     * @param qName      qualified name; may be empty when the parser is namespace-aware
     * @param attributes attributes of the element
     * @throws SAXException declared by the overridden method; not thrown here
     */
    @Override
    public void startElement(String uri, String localName, String qName,
            Attributes attributes) throws SAXException {
        // Check both names so the handler works with and without namespace awareness.
        if("a".equals(localName)||"a".equals(qName)){
            String href = attributes.getValue("href");
            // Anchors without an href (e.g. named anchors) carry no link to record.
            if(href != null){
                list.add(href);
            }
        }
    }
}