日期:2014-05-17  浏览次数:20664 次

用nekohtml来获取网页指定内容

用nekohtml来获取网页指定内容

下载地址 http://sourceforge.net/projects/nekohtml

package?test;

import?java.io.InputStream;
import?java.net.HttpURLConnection;
import?java.net.URL;

import?org.cyberneko.html.parsers.DOMParser;
import?org.w3c.dom.Document;
import?org.w3c.dom.Node;
import?org.xml.sax.InputSource;

public?class?Test?{
????
public?static?void?main(String[]?argv)?throws?Exception?{
??????? //指定url
????????URL?url?=?new?URL(
????????????????
"http://topic.csdn.net/u/20090521/11/db336c07-2dbc-4732-8229-cb99fcb9d10e.html");
???????
????????HttpURLConnection?connection?=?(java.net.HttpURLConnection)url.openConnection();
????????connection.connect();
????????InputStream?stream?
=?connection.getInputStream();
????????DOMParser?parser?
=?new?DOMParser();
??????? //这行代码等同于html页面中的<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
????????parser.setProperty("http://cyberneko.org/html/properties/default-encoding","utf-8");
????????parser.parse(
new?InputSource(stream));
????????Document?doc?
=?parser.getDocument();
????????Node myNode
=?doc.getElementById("reply57194353_body