// 日期:2014-05-17 浏览次数:20589 次 (scraped page metadata — Date: 2014-05-17, views: 20589)
package com.safetys.crawler.util; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.filters.OrFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.TableTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import com.safetys.framework.exception.ApplicationAccessException; /** * 整合百度、谷歌搜索数据 * @author zhaozhi3758 * date:2011-04-19 */ public class Crawler { private final static String splitStr="zzc@cheng"; private String encoding="gbk"; //解析页面编码 public String searchMode;//指定搜索方式 keyword 按关键字搜索,specifyUrl 按指定url搜索 public String baiduUrl; //百度搜索url,按照设定的搜索链接模版,需包含${keyword}:关键字/${searchNum}:搜索数量 "http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}" public String googleUrl; //google 搜索url,按照设定的搜索链接模版,需包含${keyword}:关键字/${searchNum}:搜索数量 "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai=" public String keyword; //搜索关键字 public int searchNum = 0;//搜索数量 public String specifyUrl; //按指定的url 搜索 /** * 抓取百度搜索结果页面 */ public List<String> crawlerBaidu(){ Parser myParser = new Parser(); try { myParser.setURL(getBaiduUrl()); myParser.setEncoding(myParser.getEncoding()); } catch (ParserException e1) { e1.printStackTrace(); } NodeList 
nodeList = null; NodeFilter tableFilter = new NodeClassFilter(TableTag.class); OrFilter lastFilter = new OrFilter(); lastFilter.setPredicates(new NodeFilter[] { tableFilter }); List<String> result = new ArrayList<String>(); try { nodeList = myParser.parse(lastFilter); for (int i = 0; i <= nodeList.size(); i++) { if (nodeList.elementAt(i) instanceof TableTag) { TableTag tag = (TableTag) nodeList.elementAt(i); if(tag.getAttribute("id")!=null){ result.addAll(getBaiduLink(tag.getChildrenHTML())); } } } } catch (ParserException e) { e.printStackTrace(); } return result; } private List<String> getBaiduLink(String s){ Parser myParser; NodeList nodeList = null; myParser = Parser.createParser(s,encoding); List<String> result = new ArrayList<String>(); try { //设置解析编码格式 nodeList =myParser.parse (new NodeClassFilter(LinkTag.class)) ; // 使用 NodeClassFilter if (nodeList!=null && nodeList.size () > 0) { // 循环遍历每个Url 节点 for (int l = 0; l < nodeList.size () ; l ++) { String urlLink= ((LinkTag) nodeList.elementAt (l)) .extractLink () ; String LinkName = ((LinkTag) nodeList.elementAt (l)).getLinkText () ; if(!LinkName.equals("百度快照") && urlLink.indexOf("baidu")==-1 && urlLink.indexOf("http") == 0){ System.out.println("baidu--->"+LinkName + splitStr + urlLink); result.add(LinkName + splitStr + urlLink); } } } } catch (ParserException e) { e.printStackTrace () ; } return result; } /** * 抓取谷歌搜索结果页面的指定范围的链接 */ private List<String> crawlerGoogle() { String htmlstr = getUrlHtmlByHttpClient(getGoogleU