日期:2014-05-20  浏览次数:20936 次

正则高手帮看看那里出现问题CPU 100%
Java code

/**
 * <p>Title: </p>
 *
 * <p>Description: </p>
 *
 * <p>Copyright: Copyright (c) 2012</p>
 *
 * <p>Company: </p>
 *
 * @author not attributable
 * @version 1.0
 */
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;

public class test {
    //返回所有组的正则
    public static ArrayDeque<String[]> regexAllGroups(String original, String regex) {
        int total = 0;
        String[] ary = null;
        ArrayDeque Q = new ArrayDeque();
        if (original == null || regex == null) {
            return Q;
        }
        Pattern p = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(original);
        while (m != null && m.find()) { //该正则在这里m.find卡死。。
            total = m.groupCount();
            if (total < 1) {
                continue;
            }
            ary = new String[total];
            for (int i = 1; i <= total; i++) {
                ary[i - 1] = new String(m.group(i));
            }
            Q.add(ary);
        }

        m = null;
        p = null;
        return Q;
    }
    
    //获取网页源码
    public static String getUrlHtml(String strURL) {
        String body = null;
        String contentEncoding = null;
        URL _URL = null;
        InputStream IN = null;
        HttpURLConnection CONNECTION = null;

        try {
            _URL = new URL(strURL);
            CONNECTION = (HttpURLConnection) _URL.openConnection();
            CONNECTION.setConnectTimeout(3000);
            CONNECTION.setReadTimeout(3000);
            CONNECTION.setRequestProperty("Accept-Encoding", "gzip,deflate");
            CONNECTION.setRequestProperty("Accept", "*/*");
            CONNECTION.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)");
            CONNECTION.setRequestProperty("Connection", "close");
            CONNECTION.setRequestMethod("GET");
            CONNECTION.setFollowRedirects(true);
            CONNECTION.setUseCaches(false);
            CONNECTION.setInstanceFollowRedirects(true);
            /*判断是不是GZIP/DEFLATE压缩格式的网页*/
            int type = 0;
            contentEncoding = CONNECTION.getContentEncoding();
            if (contentEncoding != null) {
                contentEncoding = contentEncoding.toLowerCase();
                if (contentEncoding.indexOf("gzip") != -1) {
                    type = 1;
                }
                if (contentEncoding.indexOf("deflate") != -1) {
                    type = 2;
                }
            }

            switch (type) {
            case 1:
                IN = new GZIPInputStream(CONNECTION.getInputStream());
                break;
            case 2:
                IN = new InflaterInputStream(CONNECTION.getInputStream());
                break;
            default:
                IN = CONNECTION.getInputStream();
                break;
            }

            byte[] b = null;

            if (IN != null && (b = inputStreamToByte(IN)) != null) {
                body = new String(b, "utf-8");
                IN.close();
            }
            CONNECTION.disconnect();
            b = null;
        } catch (Exception e) {

            try {
                if (IN != null) {
                    IN.close();
                }
                if (CONNECTION != null) {
                    CONNECTION.disconnect();
                }
            } catch (Exception ex) {

            }
            body = null;
        }

        IN = null;
        _URL = null;

        CONNECTION = null;
        return body;
    }
    
    
    public static byte[] inputStreamToByte(InputStream in) {

       if (in == null) {
           return null;
       }
       int ch;
       byte[] b = null;
       ByteArrayOutputStream stream = new ByteArrayOutputStream();
       try {
           while ((ch = in.read()) != -1) {
               stream.write(ch);
           }
           b = stream.toByteArray();
           stream.reset();
           stream.close();
           in.close();
       } catch (Exception e) {
           e.printStackTrace();
       }
       in = null;
       stream = null;
       return b;
   }



    public static void main(String[] args) {
        //正则表达式
        String regex = "<table cellpadding=\"0\" cellspacing=\"0\" class=\"result\" id=\"\\d+\".*><tr><td class=f><h3 class=\"t\"><a.*href=\"([\\s\\S]*?)\".*target=\"_blank\">([\\s\\S]*?)</a>([\\s\\S]*?)<br>";
        //获取该网页地址的html源代码
        String html = getUrlHtml("http://www.baidu.com/s?wd=%D2%F8%C1%AA%B4%F3%B0%AE%BF%A8&pn=0&rn=10&usm=1");
        //分析结果,在这里出现CPU资源100%
        ArrayDeque<String[]> Q = regexAllGroups(html, regex);
        

    }

}