日期:2014-05-20 浏览次数:20898 次
/**
 * Downloads a fixed list of Sina news pages, one worker thread per URL, and compares how often
 * two words occur in each page.
 *
 * <p>Each worker reads its page line by line (decoded as GB2312), captures the article title from
 * the first {@code <h1 id="artibodyTitle">} element it sees, and counts occurrences of the
 * one-character word U+515A ("party") and the two-character word U+56FD U+5BB6 ("country").
 * When the first word occurs more often than the second, the worker throws a
 * {@link RuntimeException} describing the page; the exception is not caught, so it terminates
 * only that worker thread. Otherwise the worker prints a "is good!" line to standard output.
 *
 * <p>I/O failures are reported with a stack trace on standard error and end that worker without
 * a verdict. This method returns as soon as all workers are started; it does not join them.
 *
 * @param args ignored
 * @throws Exception declared for compatibility with existing callers; nothing in this method
 *     body throws a checked exception on the calling thread
 */
public static void main(String[] args) throws Exception {
    String[] urls = {
        "http://mil.news.sina.com.cn/2012-04-10/0428687123.html",
        "http://mil.news.sina.com.cn/2012-04-12/0731687387.html",
        "http://news.sina.com.cn/c/2012-04-13/044224264609.shtml"
    };
    // Both patterns are compiled once and shared: Pattern is immutable and thread-safe,
    // while each worker creates its own Matcher instances.
    final Pattern titlePattern = Pattern.compile("<h1 id=\"artibodyTitle\".*?>(.*?)</h1>");
    final Pattern wordCountPattern = Pattern.compile("\u515a|\u56fd\u5bb6");

    for (final String url : urls) {
        new Thread(new Runnable() {
            @Override
            public void run() {
                // try-with-resources closes the reader (and the underlying stream) on every
                // exit path, including the RuntimeException thrown below.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(new URL(url).openStream(), "GB2312"))) {
                    String title = null;
                    int partyCount = 0;
                    int countryCount = 0;

                    String line;
                    while ((line = reader.readLine()) != null) {
                        // Only the first matching <h1> is kept as the title.
                        if (title == null) {
                            Matcher titleMatcher = titlePattern.matcher(line);
                            if (titleMatcher.find()) {
                                title = titleMatcher.group(1);
                            }
                        }

                        Matcher wordMatcher = wordCountPattern.matcher(line);
                        while (wordMatcher.find()) {
                            // The pattern has exactly two alternatives, distinguishable by
                            // length: one character versus two characters.
                            if (wordMatcher.group().length() == 1) {
                                partyCount++;
                            } else {
                                countryCount++;
                            }
                        }
                    }

                    if (partyCount > countryCount) {
                        throw new RuntimeException(
                                String.format("%s[%s] \u515a:%d > \u56fd\u5bb6:%d",
                                        title, url, partyCount, countryCount));
                    }
                    // %n terminates the line so that output from concurrent workers does not
                    // run together.
                    System.out.printf("%s[%s] is good!%n", title, url);
                } catch (IOException ex) {
                    ex.printStackTrace();
                }
            }
        }).start();
    }
}