// Date: 2014-05-17    Views: 20779
/**
 * Collects every link tag found in the page at the given location.
 *
 * <p>The page is fetched and parsed with htmlparser, with the encoding forced to
 * {@code "gbk"} (the original author marked this encoding as still to be decided).
 *
 * @param Url the resource handed to {@code Parser.setResource}
 * @return the nodes of the page that are {@code LinkTag} instances
 * @throws Exception whatever the parser raises while configuring or extracting
 */
public static NodeList getAllUrl(String Url) throws Exception {
    Parser pageParser = new Parser();
    pageParser.setResource(Url);
    pageParser.setEncoding("gbk");

    // Custom filter (anonymous inner class): keep a node only when it is a LinkTag.
    NodeFilter linkOnly = new NodeFilter() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean accept(Node node) {
            return node instanceof LinkTag;
        }
    };

    // Walk all nodes of the page and return the ones the filter accepts.
    return pageParser.extractAllNodesThatMatch(linkOnly);
}
/**
 * Returns the news content: the plain text of the {@code div} whose {@code id}
 * attribute is {@code zoom}.
 *
 * <p>Parser failures are not propagated to the caller; the stack trace is printed
 * and the content collected so far (the empty string unless a match was already
 * found) is returned.
 *
 * @param urlpath the resource handed to {@code Parser.setResource}
 * @return the plain text of the matching div; the empty string when no div matches;
 *         when several divs match, the text of the last one
 */
public static String getContent(String urlpath) {
    Parser parser = new Parser();
    String content = "";
    try {
        parser.setResource(urlpath);
        // NOTE(review): unlike getAllUrl/getTitle, no encoding is forced here -- confirm
        // that relying on the parser's own encoding detection is intended.
        NodeFilter divFilter = new NodeClassFilter(Div.class);
        NodeList divList = parser.parse(divFilter);
        for (int i = 0; i < divList.size(); i++) {
            Node node = divList.elementAt(i);
            // Match on the parsed id attribute rather than on the raw tag text, so that
            // id="zoom", id='zoom', different letter case, or additional attributes on
            // the div are all recognised.
            if (node instanceof Div && "zoom".equals(((Div) node).getAttribute("id"))) {
                content = node.toPlainTextString();
            }
        }
    } catch (ParserException e) {
        // Best effort: report the failure and fall through to return what we have.
        e.printStackTrace();
    }
    return content;
}
}
/**
 * Returns the title of the news page.
 *
 * <p>The page is parsed with the encoding forced to {@code "gbk"} and filtered down
 * to its title tags; the string text of the last title tag found is returned.
 *
 * @param urlpath the resource handed to {@code Parser.setResource}
 * @return the string text of the last title tag, or the empty string when there is none
 * @throws Exception whatever the parser raises while configuring or parsing
 */
public static String getTitle(String urlpath) throws Exception {
    Parser pageParser = new Parser();
    pageParser.setResource(urlpath);
    pageParser.setEncoding("gbk");

    NodeList titleNodes = pageParser.parse(new NodeClassFilter(TitleTag.class));

    String result = "";
    for (int idx = 0; idx < titleNodes.size(); idx++) {
        Node candidate = titleNodes.elementAt(idx);
        if (!(candidate instanceof TitleTag)) {
            continue;
        }
        // Later title tags overwrite earlier ones.
        result = ((TitleTag) candidate).getStringText();
    }
    return result;
}