日期:2014-05-17 浏览次数:20742 次
<tr>
<td align="center">
<ul class="ul1">
<li>本站主数据:广东省广州市 电信</li>
<li>参考数据一:广东省 电信</li>
</ul>
</td>
</tr>
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class IpGetter {
/**
 * Demo entry point: looks up a fixed sample IP address and prints both the raw
 * fragment returned by {@link #getInfo(String)} and the list extracted from it
 * by {@link #getAddresses(String)}.
 *
 * @param args command-line arguments (unused)
 * @throws IOException propagated from {@link #getInfo(String)}
 */
public static void main(String[] args) throws IOException {
    // A single instance is enough; getInfo is the only instance method used here.
    IpGetter ipGetter = new IpGetter();
    String info = ipGetter.getInfo("113.96.121.110");
    System.out.println("不规则结果:" + info);
    ArrayList<String> addresses = IpGetter.getAddresses(info);
    // String concatenation already calls toString() on the list.
    System.out.println("规范化结果:" + addresses);
}
/**
 * Fetches the ip138.com lookup page for the given IP address and returns the
 * raw markup located between the opening {@code <ul class="ul1">} tag and the
 * next {@code </ul></td>} that follows it. The result is an unparsed HTML
 * fragment; pass it to {@code getAddresses} to extract the address strings.
 *
 * @param ip the IP address to look up; it is appended to the query string as-is
 * @return the text between the two markers, without the markers themselves
 * @throws IOException if {@code fetchHtml} throws it, or if either marker is
 *         missing from the returned page
 */
public String getInfo(String ip) throws IOException {
    final String startMarker = "<ul class=\"ul1\">";
    final String endMarker = "</ul></td>";

    String html = fetchHtml("http://www.ip138.com/ips138.asp?ip=" + ip
            + "&action=2", "gb2312");

    int markerPos = html.indexOf(startMarker);
    if (markerPos < 0) {
        // Without this check the substring call below would fail with an
        // opaque StringIndexOutOfBoundsException.
        throw new IOException("Start marker " + startMarker
                + " not found in response for ip " + ip);
    }
    int begin = markerPos + startMarker.length();

    // Search for the end marker only after the start marker so that an earlier
    // occurrence elsewhere in the page cannot produce an inverted range.
    int end = html.indexOf(endMarker, begin);
    if (end < 0) {
        throw new IOException("End marker " + endMarker
                + " not found in response for ip " + ip);
    }
    return html.substring(begin, end);
}
/**
 * Extracts the address entries from the raw fragment produced by
 * {@code getInfo}.
 *
 * <p>Every run of text that starts after a {@code ':'} and ends right before
 * the nearest following {@code </li>} (on the same line) becomes one entry.
 * Entries are returned in the order they occur in the input.
 *
 * @param info the raw HTML fragment to scan
 * @return the captured entries; empty when nothing matches
 */
public static ArrayList<String> getAddresses(String info) {
    // Lazy quantifier: capture stops at the first "</li>" after the colon.
    final Matcher entryMatcher = Pattern.compile(":(.*?)</li>").matcher(info);
    final ArrayList<String> entries = new ArrayList<String>();
    for (boolean found = entryMatcher.find(); found; found = entryMatcher.find()) {
        String entry = entryMatcher.group(1);
        entries.add(entry);
    }
    return entries;
}
/**
* 抓取某个网页的源代码
*
* @param urlstr
* 要抓取网页的地址
* @param charset
* 网页所使用的编码 如"utf-8","gbk"
&nb