日期:2014-05-16 浏览次数:20322 次
jsoup下载地址:http://www.jsoup.org
httpclient下载地址:http://hc.apache.org/downloads.cgi
其他jar包见附件
package jsoup;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.util.EntityUtils;

import com.google.api.translate.Language;
import com.google.api.translate.Translate;

/**
 * Base class for the Google logo download program. Provides helpers for
 * translating text, fetching a page over HTTP GET and downloading a file.
 */
public abstract class Crawler {

    /** User-Agent header sent with every request issued by this class. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.9) Gecko/20100315 Firefox/3.5.9";

    /**
     * Translates English text to Chinese using the Google Translate API.
     *
     * @param en
     *            the English text to translate
     * @return the translated text, or an empty string if the translation
     *         call throws
     */
    public String translateEnToCinese(String en) {
        Translate.setHttpReferrer("http://www.xxx.com");
        try {
            return Translate.execute(en, Language.ENGLISH, Language.CHINESE);
        } catch (Exception e) {
            // Best effort: report the failure and fall through to "".
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Creates a new, empty map.
     *
     * @return a fresh {@link HashMap} created with an initial capacity of 0
     */
    public Map<String, Object> getMap() {
        return new HashMap<String, Object>(0);
    }

    /**
     * Downloads a file over HTTP GET and writes it to disk.
     * <p>
     * The body is saved only when the server answers 200 (OK) with an
     * entity; otherwise a message is printed and no file is created or
     * overwritten. Failures while requesting or copying are printed, not
     * rethrown. The request and the HTTP client are always released.
     *
     * @param url
     *            HTTP address of the file
     * @param dir
     *            path of the target file
     * @throws Exception
     *             if {@code url} cannot be parsed as a URI
     */
    public void downloadFile(String url, String dir) throws Exception {
        // Parse the URI before allocating the client so that a malformed
        // URL cannot leave an unreleased client behind.
        HttpGet httpGet = new HttpGet();
        httpGet.setURI(new java.net.URI(url));
        DefaultHttpClient httpClient = newHttpClient();
        InputStream input = null;
        FileOutputStream output = null;
        try {
            HttpResponse response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            HttpEntity entity = response.getEntity();
            if (status != HttpStatus.SC_OK || entity == null) {
                // Do not save an error page (or nothing) as the download.
                System.out.println("Download skipped for [" + url + "]: HTTP status " + status);
                return;
            }
            input = entity.getContent();
            File file = new File(dir);
            output = FileUtils.openOutputStream(file);
            IOUtils.copy(input, output);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            IOUtils.closeQuietly(output);
            IOUtils.closeQuietly(input);
            // Release the connection and the client's resources.
            httpGet.abort();
            httpClient.getConnectionManager().shutdown();
        }
    }

    /**
     * Performs a GET request and returns the whole page as a string.
     * <p>
     * Failures while executing the request are printed, not rethrown.
     *
     * @param url
     *            address to fetch
     * @param params
     *            optional; the first element, if present, is the charset
     *            name passed to {@code EntityUtils.toString} (defaults to
     *            "UTF-8")
     * @return the response body, or an empty string if the status is not
     *         200 (OK), the response has no entity, or the request fails
     * @throws Exception
     *             if {@code url} cannot be parsed as a URI
     */
    public synchronized String doGet(String url, String... params) throws Exception {
        String charset = "UTF-8";
        if (null != params && params.length >= 1) {
            charset = params[0];
        }
        // Parse the URI before allocating the client so that a malformed
        // URL cannot leave an unreleased client behind.
        HttpGet httpGet = new HttpGet();
        httpGet.setURI(new java.net.URI(url));
        DefaultHttpClient httpClient = newHttpClient();
        String content = "";
        try {
            HttpResponse response = httpClient.execute(httpGet);
            int resStatu = response.getStatusLine().getStatusCode();
            if (resStatu == HttpStatus.SC_OK) {
                HttpEntity entity = response.getEntity();
                if (entity != null) {
                    // Pass the charset explicitly; per the original author's
                    // note, EntityUtils otherwise falls back to ISO-8859-1.
                    content = EntityUtils.toString(entity, charset);
                }
            }
        } catch (Exception e) {
            System.out.println("访问【" + url + "】出现异常!");
            e.printStackTrace();
        } finally {
            // Release the connection and the client's resources.
            httpGet.abort();
            httpClient.getConnectionManager().shutdown();
        }
        return content;
    }

    /** Creates an HTTP client configured with this class's User-Agent. */
    private static DefaultHttpClient newHttpClient() {
        DefaultHttpClient httpClient = new DefaultHttpClient();
        HttpProtocolParams.setUserAgent(httpClient.getParams(), USER_AGENT);
        return httpClient;
    }
}
?
?
package jsoup; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Map; import org.apache.commons.io.FileU