日期:2014-05-20 浏览次数:20909 次
package com.lee.test;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads every image referenced by {@code <img src="...">} tags on a web page
 * (or on a numbered series of pages) into a fixed local folder.
 *
 * <p>Files are named {@code default_filename_suffix_<n>.<ext>}, where {@code n} is a
 * class-wide counter that starts at 0 on every JVM start, so a later run overwrites
 * the files of an earlier one. The counter is a plain static field with no
 * synchronization, so the class is not safe for concurrent use.
 *
 * <p>Original author's note: there was a lot more I wanted to polish; what remains
 * is mostly a matter of putting in the time.
 *
 * @author Lee
 */
public class GetImagesFromWeb {

    /** Folder that receives the downloaded images. */
    private static final File FOLDER;

    /** Running number appended to the file name prefix; shared by all downloads. */
    private static long counter = 0;

    /** Prefix of every generated file name (the literal value is kept for compatibility). */
    private static final String FILE_NAME_PREFIX = "default_filename_suffix_";

    /** Extension used when the image URL does not carry a usable one. */
    private static final String DEFAULT_EXTENSION = "img";

    /** Size of the copy buffer in bytes. */
    private static final int BUFFER_SIZE = 1024;

    /** Give up connecting after this many milliseconds instead of blocking forever. */
    private static final int CONNECT_TIMEOUT_MS = 10000;

    /** Give up on a single blocked read after this many milliseconds. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Captures the value of the {@code src} attribute of an {@code <img>} tag in group 2.
     * Requiring whitespace directly before {@code src} keeps attributes such as
     * {@code data-src} from matching; group 1 is the opening quote so the same quote
     * character must close the value.
     */
    private static final Pattern IMG_SRC = Pattern.compile(
            "<img(?=\\s)[^>]*?\\ssrc\\s*=\\s*([\"'])(.*?)\\1",
            Pattern.CASE_INSENSITIVE);

    static {
        String path = "C:\\Documents and Settings\\Administrator\\桌面\\MyImagesFolder";
        FOLDER = new File(path);
        if (!FOLDER.exists()) {
            // mkdirs() also creates missing parent directories. A failure here is not
            // reported; it surfaces later when the first output file cannot be created.
            FOLDER.mkdirs();
        }
    }

    /**
     * Extracts the file extension from a URL or URL path.
     *
     * <p>Only the last path segment is inspected (query string and fragment are
     * ignored), so a dot in the host name or in a directory name is never mistaken
     * for an extension.
     *
     * @param url URL or URL path of the image
     * @return the extension without the dot, or {@link #DEFAULT_EXTENSION} when the
     *         last segment has no extension made purely of letters and digits
     */
    private static String getExtName(String url) {
        int end = url.length();
        int query = url.indexOf('?');
        if (query >= 0) {
            end = Math.min(end, query);
        }
        int fragment = url.indexOf('#');
        if (fragment >= 0) {
            end = Math.min(end, fragment);
        }
        String path = url.substring(0, end);
        String lastSegment = path.substring(path.lastIndexOf('/') + 1);

        int dot = lastSegment.lastIndexOf('.');
        if (dot < 0 || dot == lastSegment.length() - 1) {
            return DEFAULT_EXTENSION;
        }
        String ext = lastSegment.substring(dot + 1);
        for (int i = 0; i < ext.length(); i++) {
            if (!Character.isLetterOrDigit(ext.charAt(i))) {
                return DEFAULT_EXTENSION;
            }
        }
        return ext;
    }

    /**
     * Opens a connection to {@code target} with connect and read timeouts applied.
     *
     * @param target URL to connect to
     * @return the configured, not yet connected, connection
     * @throws IOException if the connection cannot be opened
     */
    private static URLConnection openConnection(URL target) throws IOException {
        URLConnection connection = target.openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        connection.setReadTimeout(READ_TIMEOUT_MS);
        return connection;
    }

    /**
     * Closes {@code resource} if it is non-null, printing the stack trace of any
     * failure instead of propagating it.
     *
     * @param resource stream or reader to close; may be {@code null}
     */
    private static void close(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the absolute URL of every {@code <img src>} found in one line of HTML.
     *
     * <p>Any query string is cut off the {@code src} value, empty values are skipped,
     * and the remainder is resolved against {@code page} so relative and
     * protocol-relative references become absolute.
     *
     * @param page URL of the page the line came from; base for relative references
     * @param line one line of the page's HTML
     * @param urls list that receives the absolute image URLs
     */
    private static void collectImageUrls(URL page, String line, List<String> urls) {
        Matcher matcher = IMG_SRC.matcher(line);
        while (matcher.find()) {
            String src = matcher.group(2).trim();
            // The URL may carry parameters; they are dropped.
            int query = src.indexOf('?');
            if (query > -1) {
                src = src.substring(0, query);
            }
            if (src.length() == 0) {
                continue;
            }
            try {
                urls.add(new URL(page, src).toString());
            } catch (MalformedURLException e) {
                System.out.println(src + "不合法!");
            }
        }
    }

    /**
     * Fetches a page and returns the absolute URLs of the images it references.
     *
     * <p>The page is read line by line using the platform default charset, so an
     * {@code <img>} tag split across several lines is not detected. On a read error
     * the URLs collected so far are returned.
     *
     * @param url address of the page to scan
     * @return the image URLs in document order; empty (never {@code null}) when the
     *         address is invalid or the page cannot be fetched
     */
    private static List<String> getImageUrls(String url) {
        List<String> urls = new ArrayList<String>();

        URL page;
        try {
            page = new URL(url);
        } catch (MalformedURLException e) {
            System.out.println(url + " 不合法!");
            return urls;
        }

        URLConnection connection;
        try {
            connection = openConnection(page);
        } catch (IOException e) {
            System.out.println("网络连接错误!");
            return urls;
        }

        BufferedReader br;
        try {
            br = new BufferedReader(new InputStreamReader(connection.getInputStream()));
        } catch (IOException e) {
            System.out.println("IO设备错误");
            return urls;
        }

        try {
            String line;
            while ((line = br.readLine()) != null) {
                collectImageUrls(page, line, urls);
            }
        } catch (IOException e) {
            System.out.println("流读写错误");
        } finally {
            close(br);
        }
        return urls;
    }

    /**
     * Downloads one image into the output folder.
     *
     * <p>The file counter is consumed only once the remote stream has been opened.
     * Both streams are closed before returning, the success message is printed only
     * when the whole image was written, and a partially written file is deleted.
     *
     * @param imageUrl absolute URL of the image
     */
    private static void downloadImage(String imageUrl) {
        URL image;
        try {
            image = new URL(imageUrl);
        } catch (MalformedURLException e) {
            System.out.println(imageUrl + "不合法!");
            return;
        }

        InputStream in;
        try {
            in = openConnection(image).getInputStream();
        } catch (IOException e) {
            System.out.println("IO 错误!");
            return;
        }

        File file = new File(FOLDER,
                FILE_NAME_PREFIX + (counter++) + "." + getExtName(image.getPath()));
        FileOutputStream out = null;
        boolean saved = false;
        try {
            out = new FileOutputStream(file);
            byte[] buffer = new byte[BUFFER_SIZE];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            out.flush();
            saved = true;
        } catch (FileNotFoundException e) {
            // Thrown by the FileOutputStream constructor when the file cannot be created.
            System.out.println("建立文件" + file.getAbsolutePath() + " 失败!");
        } catch (IOException e) {
            System.out.println("IO错误!");
        } finally {
            close(out);
            close(in);
        }

        if (saved) {
            System.out.println(file.getName() + " 获取成功!");
        } else if (out != null) {
            // The file was created but the copy failed: do not leave a truncated image.
            file.delete();
        }
    }

    /**
     * Downloads every image referenced on a single page.
     *
     * <p>Problems (invalid address, unreachable page, failed download) are reported
     * on standard output and never thrown; a failed image does not stop the others.
     *
     * @param url address of the page to scan
     */
    public static void getImagesFromSinglePage(String url) {
        for (String imageUrl : getImageUrls(url)) {
            downloadImage(imageUrl);
        }
    }

    /**
     * Downloads the images of a series of pages whose addresses differ only by a number.
     *
     * <p>For every {@code i} from {@code beginIndex} to {@code endIndex} (both
     * inclusive) the page {@code urlFirst + i + urlLast} is processed.
     *
     * @param urlFirst   part of the address before the page number
     * @param urlLast    part of the address after the page number
     * @param beginIndex first page number
     * @param endIndex   last page number
     */
    public static void batchGetImages(String urlFirst, String urlLast, int beginIndex, int endIndex) {
        for (int i = beginIndex; i <= endIndex; i++) {
            getImagesFromSinglePage(urlFirst + i + urlLast);
        }
    }

    /**
     * Example run: fetches the images of pages 1 to 30 of a paginated gallery.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // Fetch all images of a single page:
        // getImagesFromSinglePage("http://www.qiushibaike.com/new2/pic/20/page/6/");

        // Fetch the images of all pages whose address follows a numeric pattern.
        // Take http://www.qiushibaike.com/new2/pic/20/page/350/ as an example:
        // 350 means page 350, 1 means the first page.
        // The call below fetches pages 1 through 30, roughly 600 images.
        batchGetImages("http://www.qiushibaike.com/new2/pic/20/page/", "/", 1, 30);
    }
}