日期:2014-05-20 浏览次数:20717 次
/**
 * Extracts proxy entries from the HTML of a samair proxy-list page.
 *
 * <p>Each table row that matches the row pattern yields one string of five
 * semicolon-separated fields: {@code ip;port;cell3;cell4;cell5}. Going by the
 * sample row in the original debug notes, the three trailing cells look like
 * the anonymity level, the last-check time and the country; they are copied
 * verbatim (no trimming).
 *
 * @param html page markup to scan; {@code null} is treated as empty input
 * @return one record per matched row, in document order; empty if nothing matched
 */
private List<String> parsePage_samair(String html){
    List<String> ret = new ArrayList<String>();
    SpiderProxyBase.baseLog("Enter parsePage_samair");

    // Each trailing cell is captured with a lazy (.+?) that ends directly at
    // </td>. An extra [^<] between the group and </td> would consume the
    // cell's last character outside the group and truncate every captured value.
    String regEx = "<tr><td>(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})<script.+?[^>]>:(\\d{1,5})</td><td>(.+?)</td><td>(.+?)</td><td>(.+?)</td></tr>";

    // DOTALL lets '.' match line terminators, so a row may span several lines.
    Pattern pat = Pattern.compile(regEx, Pattern.DOTALL);
    Matcher mat = pat.matcher(html == null ? "" : html);

    while (mat.find()) {
        // Single-threaded local buffer, so StringBuilder is sufficient.
        StringBuilder result = new StringBuilder();
        result.append(mat.group(1)).append(";");
        result.append(mat.group(2)).append(";");
        result.append(mat.group(3)).append(";");
        result.append(mat.group(4)).append(";");
        result.append(mat.group(5));

        String record = result.toString();
        ret.add(record);
        System.out.println(record);
    }

    SpiderProxyBase.baseLog("Exit parsePage_samair");
    return ret;
}
// Row pattern for the proxy table: group 1 is the dotted-quad IP, group 2 the
// port that follows the inline <script> fragment, and groups 3-5 are the next
// three <td> cells. Each of those cells is captured with a lazy (.+?) that
// ends directly at </td>, so the captured text keeps the cell's last character.
String regEx = "<tr><td>(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})<script.+?[^>]>:(\\d{1,5})</td><td>(.+?)</td><td>(.+?)</td><td>(.+?)</td></tr>";