日期:2014-05-17  浏览次数:20744 次

利用htmlparser解析网页的标题、keywords和Description

网页中的标题、keywords和Description这三项对于建立网页索引是非常重要的。以下是用htmlparser对网页进行解析来得到这三个值。

try {
	// Select only <title> and <meta> tags: a node is kept when either
	// tag-name predicate accepts it.
	NodeFilter title_filter = new TagNameFilter("title");
	NodeFilter meta_filter = new TagNameFilter("meta");
	OrFilter filters = new OrFilter();
	filters.setPredicates(new NodeFilter[]{title_filter, meta_filter});

	// Point the parser at the page to analyse (a local file in this example).
	Parser parser = new Parser();
	parser.setURL("D:\\test.html");
	// NOTE(review): this re-applies the encoding the parser already reports,
	// so it looks like a no-op; kept as-is until confirmed it can be dropped.
	parser.setEncoding(parser.getEncoding());

	NodeList list = parser.extractAllNodesThatMatch(filters);
	for (int i = 0; i < list.size(); i++) {
		Tag tag = (Tag) list.elementAt(i);

		if (tag instanceof MetaTag) {
			// Only <meta name="keywords"> and <meta name="description"> are
			// reported; the name comparison is case-insensitive.
			String name = tag.getAttribute("name");
			if (name != null) {
				String content = tag.getAttribute("content");
				if (name.equalsIgnoreCase("Keywords")) {
					System.out.println("Keywords : " + content);
				} else if (name.equalsIgnoreCase("Description")) {
					System.out.println("Description" + " : " + content);
				}
			}
		} else if (tag instanceof TitleTag) {
			// Tag.getText() yields the raw markup between the angle brackets
			// (i.e. "title"), not the page title. TitleTag.getTitle() returns
			// the text enclosed by <title>...</title>.
			System.out.println("Title : " + ((TitleTag) tag).getTitle());
		}
	}
} catch (Exception e) {
	// Example-level handling: report the parsing/IO failure and continue.
	e.printStackTrace();
}

?