// Date: 2014-05-20 — Views: 20914
/**
 * <p>Title: </p>
 *
 * <p>Description: Downloads a web page and extracts every regular-expression
 * match (all capturing groups) from its HTML.</p>
 *
 * <p>Copyright: Copyright (c) 2012</p>
 *
 * <p>Company: </p>
 *
 * @author not attributable
 * @version 1.0
 */
import java.io.*;
import java.net.*;
import java.nio.charset.Charset;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;

/**
 * Small collection of static helpers: fetch a page over HTTP, drain a stream
 * into a byte array, and collect the capturing groups of every regex match.
 */
public class test {

    /** Size of the scratch buffer used when draining a stream. */
    private static final int BUFFER_SIZE = 8192;

    /** Charset used when the response does not declare a usable one. */
    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    /** Extracts the {@code charset} parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Returns the capturing groups of every match of {@code regex} in
     * {@code original}. Matching is case-insensitive.
     *
     * @param original text to search; may be {@code null}
     * @param regex    regular expression; may be {@code null}
     * @return one {@code String[]} per match, in match order, where index
     *         {@code i} holds group {@code i + 1}. A group that did not
     *         participate in a match is stored as {@code null}. The result is
     *         empty when either argument is {@code null} or when the pattern
     *         declares no capturing groups.
     * @throws PatternSyntaxException if {@code regex} is not a valid pattern
     */
    public static ArrayDeque<String[]> regexAllGroups(String original, String regex) {
        ArrayDeque<String[]> matches = new ArrayDeque<String[]>();
        if (original == null || regex == null) {
            return matches;
        }

        Matcher m = Pattern.compile(regex, Pattern.CASE_INSENSITIVE).matcher(original);

        // The group count is a property of the pattern, not of an individual
        // match, so it is read once. Without groups there is nothing to
        // collect and scanning the input would be wasted work.
        int total = m.groupCount();
        if (total < 1) {
            return matches;
        }

        while (m.find()) {
            String[] groups = new String[total];
            for (int i = 1; i <= total; i++) {
                // group(i) is null for a group that did not take part in this
                // match (e.g. the unused side of an alternation); keep it as null.
                groups[i - 1] = m.group(i);
            }
            matches.add(groups);
        }
        return matches;
    }

    /**
     * Downloads the resource at {@code strURL} with an HTTP GET and returns
     * its body as text.
     *
     * <p>gzip and deflate content encodings are decompressed. The body is
     * decoded with the charset declared in the Content-Type header, falling
     * back to UTF-8 when none is declared or the declared one is unsupported.
     *
     * @param strURL address of the page to fetch
     * @return the page body, or {@code null} if the URL is invalid or any
     *         error occurs while connecting or reading
     */
    public static String getUrlHtml(String strURL) {
        HttpURLConnection connection = null;
        InputStream in = null;
        try {
            connection = (HttpURLConnection) new URL(strURL).openConnection();
            connection.setConnectTimeout(3000);
            connection.setReadTimeout(3000);
            connection.setRequestProperty("Accept-Encoding", "gzip,deflate");
            connection.setRequestProperty("Accept", "*/*");
            connection.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)");
            connection.setRequestProperty("Connection", "close");
            connection.setRequestMethod("GET");
            // Static, JVM-wide setting; kept from the original, now written
            // through the class so that it no longer looks per-connection.
            HttpURLConnection.setFollowRedirects(true);
            connection.setUseCaches(false);
            connection.setInstanceFollowRedirects(true);

            // Decide whether the body is gzip- or deflate-compressed.
            String contentEncoding = connection.getContentEncoding();
            String encoding = contentEncoding == null
                    ? ""
                    : contentEncoding.toLowerCase(Locale.ROOT);

            // Hold the raw stream before wrapping it so that it is still
            // closed if a decompressor constructor fails on a bad header.
            InputStream raw = connection.getInputStream();
            in = raw;
            if (encoding.indexOf("deflate") != -1) {
                in = new InflaterInputStream(raw);
            } else if (encoding.indexOf("gzip") != -1) {
                in = new GZIPInputStream(raw);
            }

            byte[] bytes = inputStreamToByte(in);
            if (bytes == null) {
                return null;
            }
            return new String(bytes, charsetOf(connection.getContentType()));
        } catch (Exception e) {
            // Best-effort fetch: any failure (bad URL, non-HTTP URL, timeout,
            // HTTP error, corrupt compressed data) is reported as null.
            return null;
        } finally {
            closeQuietly(in);
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Reads {@code in} to the end and returns everything read. The stream is
     * always closed before this method returns.
     *
     * @param in stream to drain; may be {@code null}
     * @return the bytes read (possibly empty), or {@code null} if {@code in}
     *         is {@code null} or an I/O error occurs while reading
     */
    public static byte[] inputStreamToByte(InputStream in) {
        if (in == null) {
            return null;
        }
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        byte[] chunk = new byte[BUFFER_SIZE];
        try {
            int n;
            while ((n = in.read(chunk)) != -1) {
                collected.write(chunk, 0, n);
            }
            return collected.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Fetches a search-result page and prints the link and title of every
     * result entry found in it.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Regular expression for one result entry. Attribute gaps use [^>]*
        // and the href value uses [^"]*, so each wildcard is confined to its
        // own tag; the earlier chained ".*" / "[\s\S]*?" wildcards overlapped
        // and backtracked heavily on long lines, pinning the CPU.
        String regex = "<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\"[^>]*>"
                + "<tr><td class=f><h3 class=\"t\"><a[^>]*href=\"([^\"]*)\"[^>]*target=\"_blank\">"
                + "([\\s\\S]*?)</a>([\\s\\S]*?)<br>";

        // Fetch the HTML source of the page.
        String html = getUrlHtml("http://www.baidu.com/s?wd=%D2%F8%C1%AA%B4%F3%B0%AE%BF%A8&pn=0&rn=10&usm=1");

        // Extract the result entries.
        ArrayDeque<String[]> results = regexAllGroups(html, regex);
        System.out.println("matches: " + results.size());
        for (String[] groups : results) {
            System.out.println(groups[0] + "\t" + groups[1]);
        }
    }

    /** Picks the charset named in a Content-Type value, or UTF-8 if none is usable. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: fall through to the default.
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /** Closes {@code c} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (IOException ignored) {
                // Nothing useful can be done about a failed close.
            }
        }
    }
}