日期:2014-05-20  浏览次数:20851 次

如何解析 HTML?
待解析的内容如下(HTTP 请求头后面跟着 HTML 正文):

GET   www.baidu.com/pub/WWW/   HTTP/1.1
Accept:   image/gif,   image/x-xbitmap,   image/jpeg,   image/pjpeg,   application/x-shockwave-flash,   application/vnd.ms-powerpoint,   application/vnd.ms-excel,   application/msword,   */*
Accept-Language:   zh-cn
Accept-Encoding:   gzip,   deflate
User-Agent:   Mozilla/4.0   (compatible;   MSIE   6.0;   Windows   NT   5.0)
Host:   www.baidu.com
Connection:   Keep-Alive
Referer:   http://www.ntop.org/
Pragma:no-cache
Content-length:244
<html>
<head>
<title> xinxi </title>
<result> </result>
</head>
<body>
<accNo> 12312 </accNO>
</body>
</html>


------解决方案--------------------
文法分析,好像课本里的东西吧
------解决方案--------------------
import au.id.jericho.lib.html.*;
import java.util.*;
import java.io.*;
import java.net.*;

public class DisplayAllElements {
public static void main(String[] args) throws Exception {
String sourceUrlString= "c:\\test.html ";
if (sourceUrlString.indexOf( ': ')==-1) sourceUrlString= "file: "+sourceUrlString;
URL sourceUrl=new URL(sourceUrlString);
String htmlText=Util.getString(new InputStreamReader(sourceUrl.openStream()));
Source source=new Source(htmlText);
source.setLogWriter(new OutputStreamWriter(System.err));
// send log messages to stderr
for (Iterator i=source.findAllElements().iterator(); i.hasNext(); )

{
Element element=(Element)i.next();
System.out.println(element.getDebugInfo());
System.out.println(element);
}
}
}