日期:2014-05-16  浏览次数:20322 次

jsoup httpclient 爬取网页并下载google图标

jsoup下载地址: http://www.jsoup.org

httpclient下载地址: http://hc.apache.org/downloads.cgi

其他jar包见附件

package jsoup;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.util.EntityUtils;

import com.google.api.translate.Language;
import com.google.api.translate.Translate;

/**
 * Shared helpers for the Google logo download program: text translation
 * through the Google Translate API, file download over HTTP, and fetching a
 * whole page as a string.
 */
public abstract class Crawler {

	/** User-Agent header sent with every HTTP request issued by this class. */
	private static final String USER_AGENT =
			"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9";

	/** Charset handed to {@code EntityUtils} when the caller of {@code doGet} supplies none. */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Translates English text to Chinese using the Google Translate API.
	 * 
	 * @param en
	 *            the English text to translate
	 * @return the translated text, or an empty string when the translation
	 *         call throws (the exception is printed, not rethrown)
	 */
	public String translateEnToCinese(String en) {
		Translate.setHttpReferrer("http://www.xxx.com");
		try {
			return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
		} catch (Exception e) {
			// Best effort: report the failure and fall through to the empty result.
			e.printStackTrace();
		}
		return "";
	}

	/**
	 * Creates a new, empty, mutable map.
	 * 
	 * @return a fresh {@link HashMap} created with an initial capacity of 0
	 */
	public Map<String, Object> getMap() {
		return new HashMap<String, Object>(0);
	}

	/**
	 * Downloads the resource at {@code url} and writes it to the file
	 * {@code dir}.
	 * <p>
	 * The file is only written when the server answers with HTTP 200 and a
	 * response body; otherwise a message is printed and nothing is written.
	 * Failures while requesting or copying are printed and not rethrown. The
	 * HTTP client and its connection are always released before returning.
	 * 
	 * @param url
	 *            HTTP address of the file
	 * @param dir
	 *            path of the destination file (despite the name, this is the
	 *            file itself, not its parent directory)
	 * @throws Exception
	 *             if {@code url} cannot be parsed as a URI
	 */
	public void downloadFile(String url, String dir) throws Exception {
		// Parse the URL first so that a malformed address fails before any client is created.
		HttpGet httpGet = new HttpGet();
		httpGet.setURI(new java.net.URI(url));
		DefaultHttpClient httpClient = newHttpClient();

		InputStream input = null;
		FileOutputStream output = null;
		try {
			HttpResponse response = httpClient.execute(httpGet);
			int statusCode = response.getStatusLine().getStatusCode();
			HttpEntity entity = response.getEntity();
			if (statusCode != HttpStatus.SC_OK || entity == null) {
				// Do not store an error page (or nothing at all) under the target file name.
				System.out.println("下载【" + url + "】失败, 状态码: " + statusCode);
				return;
			}
			input = entity.getContent();
			output = FileUtils.openOutputStream(new File(dir));
			IOUtils.copy(input, output);
		} catch (Exception e) {
			// Best effort: report the failure, the caller is not interrupted.
			e.printStackTrace();
		} finally {
			IOUtils.closeQuietly(output);
			IOUtils.closeQuietly(input);
			// Release the connection and the client, as doGet does.
			httpGet.abort();
			httpClient.getConnectionManager().shutdown();
		}
	}

	/**
	 * Issues a GET request and returns the whole response body as a string.
	 * <p>
	 * The method is {@code synchronized}, so calls on the same instance run
	 * one at a time.
	 * 
	 * @param url
	 *            address to request
	 * @param params
	 *            optional; when present, the first element is the charset
	 *            name used to decode the body (defaults to UTF-8)
	 * @return the response body, or an empty string when the status is not
	 *         200, the response has no entity, or the request fails (the
	 *         failure is printed, not rethrown)
	 * @throws Exception
	 *             if {@code url} cannot be parsed as a URI
	 */
	public synchronized String doGet(String url, String... params)
			throws Exception {
		String charset = DEFAULT_CHARSET;
		if (null != params && params.length >= 1) {
			charset = params[0];
		}

		// Parse the URL first so that a malformed address fails before any client is created.
		HttpGet httpGet = new HttpGet();
		httpGet.setURI(new java.net.URI(url));
		DefaultHttpClient httpClient = newHttpClient();

		String content = "";
		try {
			HttpResponse response = httpClient.execute(httpGet);
			int statusCode = response.getStatusLine().getStatusCode();
			if (statusCode == HttpStatus.SC_OK) {
				HttpEntity entity = response.getEntity();
				if (entity != null) {
					// The charset is passed to EntityUtils.toString as its default
					// encoding; EntityUtils' own default is ISO-8859-1.
					content = EntityUtils.toString(entity, charset);
				}
			}
		} catch (Exception e) {
			System.out.println("访问【" + url + "】出现异常!");
			e.printStackTrace();
		} finally {
			// Release the connection and the client.
			httpGet.abort();
			httpClient.getConnectionManager().shutdown();
		}
		return content;
	}

	/** Creates an HTTP client configured with the browser-like User-Agent. */
	private static DefaultHttpClient newHttpClient() {
		DefaultHttpClient httpClient = new DefaultHttpClient();
		HttpProtocolParams.setUserAgent(httpClient.getParams(), USER_AGENT);
		return httpClient;
	}
}

?

?

package jsoup;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;

import org.apache.commons.io.FileU