日期:2014-05-17  浏览次数:20540 次

HtmlParser 解析搜索页面
package com.safetys.crawler.util;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.safetys.framework.exception.ApplicationAccessException;
/**
 * Aggregates search results from Baidu and Google.
 * @author zhaozhi3758
 * date:2011-04-19
 */
public class Crawler {
	
	
	private final static String splitStr="zzc@cheng"; // separator token placed between link text and URL in results
	private String encoding="gbk"; //page encoding used when parsing result pages
	public String searchMode;//search mode: "keyword" = search by keyword, "specifyUrl" = crawl a given URL
	public String baiduUrl; //Baidu search URL template; must contain ${keyword} (search term) and ${searchNum} (result count), e.g. "http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}"
	public String googleUrl; //Google search URL template; must contain ${keyword} (search term) and ${searchNum} (result count), e.g. "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai="
	public String keyword; //search keyword
	public int searchNum = 0;//number of results to request
	public String specifyUrl; //explicit URL to crawl when searchMode is "specifyUrl"
	
	/**
	 * 抓取百度搜索结果页面
	 */
	/**
	 * Crawls the Baidu search result page and extracts result links.
	 * <p>
	 * Parses the page produced by {@code getBaiduUrl()}, scans every
	 * {@code <table>} element that carries an {@code id} attribute (Baidu
	 * result entries), and collects the links found inside via
	 * {@link #getBaiduLink(String)}.
	 *
	 * @return list of "linkText + splitStr + url" strings; empty on failure
	 */
	public  List<String> crawlerBaidu(){
	    List<String> result = new ArrayList<String>();
	    Parser myParser = new Parser();
	    try {
			myParser.setURL(getBaiduUrl());
			// Fix: the original called setEncoding(myParser.getEncoding()),
			// which is a no-op; use the configured page encoding instead.
			myParser.setEncoding(encoding);
		} catch (ParserException e1) {
			e1.printStackTrace();
			// Without a valid URL there is nothing to parse.
			return result;
		}
	    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
	    OrFilter lastFilter = new OrFilter();
	    lastFilter.setPredicates(new NodeFilter[] { tableFilter });
	    try {
	      NodeList nodeList = myParser.parse(lastFilter);
	      // Fix: loop bound was "i <= nodeList.size()" (off-by-one); the last
	      // iteration accessed an index past the end of the node list.
	      for (int i = 0; i < nodeList.size(); i++) {
	    		if (nodeList.elementAt(i) instanceof TableTag) {
	    			TableTag tag = (TableTag) nodeList.elementAt(i);
	    			// Only tables with an id attribute hold search result entries.
	    			if(tag.getAttribute("id")!=null){
	    				result.addAll(getBaiduLink(tag.getChildrenHTML()));
	    			}
	    		}
	    	}
	    } catch (ParserException e) {
	      e.printStackTrace();
	    }
	    return result;
   }
	
   /**
    * Extracts candidate result links from an HTML fragment of a Baidu
    * result table.
    * <p>
    * Keeps only anchors whose text is not the Baidu cache link, whose URL
    * does not point back into Baidu itself, and whose URL starts with
    * "http". Each kept entry is encoded as "linkText + splitStr + url".
    *
    * @param s HTML fragment to scan for anchor tags
    * @return list of encoded link entries; empty when nothing matches
    */
   private  List<String> getBaiduLink(String s){
		List<String> links = new ArrayList<String>();
		Parser fragmentParser = Parser.createParser(s, encoding);
		try {
			// Collect every <a> tag in the fragment.
			NodeList anchors = fragmentParser.parse(new NodeClassFilter(LinkTag.class));
			if (anchors != null) {
				for (int idx = 0; idx < anchors.size(); idx++) {
					LinkTag anchor = (LinkTag) anchors.elementAt(idx);
					String href = anchor.extractLink();
					String text = anchor.getLinkText();
					// Drop cache links, Baidu-internal URLs, and non-http schemes.
					boolean keep = !text.equals("百度快照")
							&& href.indexOf("baidu") == -1
							&& href.indexOf("http") == 0;
					if (keep) {
						System.out.println("baidu--->" + text + splitStr + href);
						links.add(text + splitStr + href);
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return links;
	}
   
   
	/**
	 * 抓取谷歌搜索结果页面的指定范围的链接
	 */
   private  List<String> crawlerGoogle() {   
	   String htmlstr = getUrlHtmlByHttpClient(getGoogleU