日期:2014-05-20 浏览次数:20890 次
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
/**
 * Downloads one forum page and prints the target of the link whose text is
 * "社区".
 *
 * The page is decoded with the charset announced in the HTTP Content-Type
 * header when there is one; otherwise the platform default charset is used.
 */
public class UrlCodeRegex
{
    /** Page that is downloaded and searched. */
    private static final String PAGE_URL =
        "http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm";

    /**
     * An anchor tag whose text is exactly "社区"; group 1 is the href value.
     * "[^\"]*" and "[^>]*" keep the match inside one attribute / one tag, so
     * extra attributes such as target="_blank" are not captured and a
     * different link earlier on the page cannot be picked up by accident.
     */
    private static final Pattern COMMUNITY_LINK =
        Pattern.compile("<a(?:\\s[^>]*?)?\\shref=\"([^\"]*)\"[^>]*>社区</a>");

    /** Extracts the charset parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
        Pattern.compile("charset\\s*=\\s*\"?([^\";\\s]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the page and prints the community link, or a "not found"
     * message when the page contains no such link.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        try
        {
            String link = extractLink(fetch(PAGE_URL));
            if (link == null)
            {
                // The original called group(1) unconditionally, which throws
                // IllegalStateException when nothing matched.
                System.out.println("没有找到社区的网址");
            }
            else
            {
                System.out.println("这个社区的网址是" + link);
            }
        }
        catch (MalformedURLException mfURLe) {
            System.out.println("MalformedURLException: " + mfURLe);
        }
        catch (IOException ioe) {
            System.out.println("IOException: " + ioe);
        }
    }

    /** Returns the href of the first "社区" link in html, or null if there is none. */
    private static String extractLink(String html)
    {
        Matcher m = COMMUNITY_LINK.matcher(html);
        return m.find() ? m.group(1) : null;
    }

    /** Downloads the document at address and returns its text, always closing the stream. */
    private static String fetch(String address) throws IOException
    {
        URLConnection con = new URL(address).openConnection();
        InputStream raw = con.getInputStream();
        try
        {
            BufferedReader in = new BufferedReader(openReader(raw, charsetOf(con)));
            StringBuffer sb = new StringBuffer();
            String line;
            while ((line = in.readLine()) != null)
            {
                // Keep a separator so that a tag split over two lines does
                // not get glued together (e.g. "<a" + "href" -> "<ahref").
                sb.append(line).append('\n');
            }
            return sb.toString();
        }
        finally
        {
            raw.close();
        }
    }

    /** Returns the charset named in the response's Content-Type header, or null if absent. */
    private static String charsetOf(URLConnection con)
    {
        String type = con.getContentType();
        if (type != null)
        {
            Matcher m = CHARSET_PARAM.matcher(type);
            if (m.find())
            {
                return m.group(1);
            }
        }
        return null;
    }

    /** Wraps raw in a reader for charset; falls back to the platform default if it is null or unsupported. */
    private static Reader openReader(InputStream raw, String charset)
    {
        if (charset != null)
        {
            try
            {
                return new InputStreamReader(raw, charset);
            }
            catch (UnsupportedEncodingException ignored)
            {
                // Server announced a charset this JVM does not know;
                // decode with the platform default as the original did.
            }
        }
        return new InputStreamReader(raw);
    }
}
------解决方案--------------------
JDK自带的javax.swing.text.html.parser这个包
------解决方案--------------------
或者这个更好
/**
* 此程序是获得网页源代码中某个关键字的链接网址,
* 如<a href="http://www.sina.com" target="_blank">新浪</a>
*/
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;
/**
 * Downloads one forum page and prints the absolute (http/https) URL of the
 * link whose text is "社区".
 *
 * The page is decoded with the charset announced in the HTTP Content-Type
 * header when there is one; otherwise the platform default charset is used.
 */
public class UrlCodeRegex
{
    /** Page that is downloaded and searched. */
    private static final String PAGE_URL =
        "http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm";

    /**
     * An anchor tag whose text is exactly "社区" and whose href is an absolute
     * http(s) URL; group 1 is that URL. "[^>]*" (instead of ".*") keeps the
     * match inside a single tag, so the href of an unrelated earlier link is
     * never returned, and "[^\"]+" accepts every URL character (the original
     * character class rejected e.g. '~', which PAGE_URL itself contains).
     */
    private static final Pattern COMMUNITY_LINK =
        Pattern.compile("<a(?:\\s[^>]*?)?\\shref=\"(https?://[^\"]+)\"[^>]*>社区</a>");

    /** Extracts the charset parameter of a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
        Pattern.compile("charset\\s*=\\s*\"?([^\";\\s]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Fetches the page and prints the community link, or a "not found"
     * message when the page contains no such link.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        try
        {
            String link = extractLink(fetch(PAGE_URL));
            if (link == null)
            {
                // The original called group(1) unconditionally, which throws
                // IllegalStateException when nothing matched.
                System.out.println("没有找到社区的网址");
            }
            else
            {
                System.out.println("这个社区的网址是" + link);
            }
        }
        catch (MalformedURLException mfURLe) {
            System.out.println("MalformedURLException: " + mfURLe);
        }
        catch (IOException ioe) {
            System.out.println("IOException: " + ioe);
        }
    }

    /** Returns the URL of the first "社区" link in html, or null if there is none. */
    private static String extractLink(String html)
    {
        Matcher m = COMMUNITY_LINK.matcher(html);
        return m.find() ? m.group(1) : null;
    }

    /** Downloads the document at address and returns its text, always closing the stream. */
    private static String fetch(String address) throws IOException
    {
        URLConnection con = new URL(address).openConnection();
        InputStream raw = con.getInputStream();
        try
        {
            BufferedReader in = new BufferedReader(openReader(raw, charsetOf(con)));
            StringBuffer sb = new StringBuffer();
            String line;
            while ((line = in.readLine()) != null)
            {
                // Keep a separator so that a tag split over two lines does
                // not get glued together (e.g. "<a" + "href" -> "<ahref").
                sb.append(line).append('\n');
            }
            return sb.toString();
        }
        finally
        {
            raw.close();
        }
    }

    /** Returns the charset named in the response's Content-Type header, or null if absent. */
    private static String charsetOf(URLConnection con)
    {
        String type = con.getContentType();
        if (type != null)
        {
            Matcher m = CHARSET_PARAM.matcher(type);
            if (m.find())
            {
                return m.group(1);
            }
        }
        return null;
    }

    /** Wraps raw in a reader for charset; falls back to the platform default if it is null or unsupported. */
    private static Reader openReader(InputStream raw, String charset)
    {
        if (charset != null)
        {
            try
            {
                return new InputStreamReader(raw, charset);
            }
            catch (UnsupportedEncodingException ignored)
            {
                // Server announced a charset this JVM does not know;
                // decode with the platform default as the original did.
            }
        }
        return new InputStreamReader(raw);
    }
}
------解决方案--------------------
用 javax.xml.parsers.SAXParser
自己写个 handler 就好了! 像这样
/**
 * Parses the XML/XHTML document at a URL with SAX and prints every link
 * target collected by MyHandler.
 *
 * @param args ignored
 * @throws Exception if the parser cannot be configured, the URL cannot be
 *         opened, or the document is not well-formed XML
 */
public static void main(String[] args)throws Exception{
    SAXParserFactory factory = SAXParserFactory.newInstance();
    factory.setNamespaceAware(true);
    // The document comes from the network: do not resolve external entities
    // or fetch the external DTD subset (XXE, and avoids a DTD download on
    // every parse of an XHTML page).
    factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
    factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
    SAXParser parser = factory.newSAXParser();
    URL url = new URL("xxxx"); // placeholder: replace xxxx with your URL
    URLConnection con = url.openConnection();
    MyHandler myhandler = new MyHandler();
    java.io.InputStream in = con.getInputStream();
    try {
        parser.parse(in, myhandler);
    } finally {
        // Release the connection even when parsing fails.
        in.close();
    }
    System.out.println(myhandler.list);
}
/**
 * SAX handler that collects the href attribute of every "a" element it is
 * notified about.
 */
class MyHandler extends DefaultHandler{
    /** The collected href values (String), in the order the elements started. */
    public ArrayList list = new ArrayList();

    /**
     * Records the href of an "a" element; every other element is ignored.
     *
     * @param s          namespace URI of the element (not used: it is not an element name)
     * @param s1         local name; filled in by namespace-aware parsers
     * @param s2         qualified name; filled in by parsers that are not namespace-aware
     * @param attributes attributes of the element
     * @throws SAXException never thrown here; declared by the overridden method
     */
    public void startElement(String s, String s1, String s2,
            Attributes attributes) throws SAXException {
        // Which of the two names is populated depends on the parser's
        // namespace settings, so accept either one.
        boolean anchor = "a".equals(s1) || "a".equals(s2);
        if (!anchor) {
            return;
        }
        String href = attributes.getValue("href");
        // Named anchors (<a name="...">) have no href; do not store null.
        if (href != null) {
            list.add(href);
        }
    }
}