日期:2014-05-20 浏览次数:20716 次
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;

/**
 * Downloads one web page and prints the {@code href} of the anchor whose
 * visible text equals {@link #LINK_TEXT}.
 */
public class UrlCodeRegex {

    /** Visible text of the anchor whose target is looked up. */
    private static final String LINK_TEXT = "社区";

    /**
     * Fetches the hard-coded page, looks up the link and prints it.
     * Network and URL errors are reported on standard output, as is the
     * case where the page contains no matching link.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String ur = "http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm";
        try {
            // Fetch the remote page content.
            String link = findLinkByText(fetch(ur), LINK_TEXT);
            if (link == null) {
                // Previously this case surfaced as an uncaught IllegalStateException
                // from Matcher.group, because the result of matches() was ignored.
                System.out.println("No <a> link with text \"" + LINK_TEXT + "\" found at " + ur);
            } else {
                System.out.println("这个社区的网址是" + link);
            }
        } catch (MalformedURLException mfURLe) {
            System.out.println("MalformedURLException: " + mfURLe);
        } catch (IOException ioe) {
            System.out.println("IOException: " + ioe);
        }
    }

    /**
     * Returns the {@code href} value of the first {@code <a href="...">} element
     * in {@code html} whose text is exactly {@code linkText}.
     *
     * <p>The captured value stops at the closing quote of the attribute, and
     * further attributes after {@code href} (for example {@code target}) are
     * skipped, so they never end up in the result.
     *
     * @param html     page source to search
     * @param linkText exact anchor text; treated literally, not as a regex
     * @return the href value, or {@code null} when no such anchor exists
     */
    static String findLinkByText(String html, String linkText) {
        Pattern anchor = Pattern.compile(
                "<a\\s+href=\"([^\"]*)\"[^>]*>" + Pattern.quote(linkText) + "</a>");
        Matcher m = anchor.matcher(html);
        // find() scans for the anchor anywhere in the page; no leading/trailing ".*" needed.
        return m.find() ? m.group(1) : null;
    }

    /**
     * Reads the whole response body of {@code address} into a string.
     *
     * @param address URL to download
     * @return the page text, one {@code '\n'} after every line read
     * @throws IOException if the URL is malformed or the download fails
     */
    private static String fetch(String address) throws IOException {
        URLConnection con = new URL(address).openConnection();
        StringBuilder page = new StringBuilder();
        // NOTE(review): decodes with the platform default charset, as the original
        // did; confirm the encoding the target page is actually served in.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream()))) {
            String line;
            while ((line = in.readLine()) != null) {
                // Keep a separator so tokens split across lines are not fused together.
                page.append(line).append('\n');
            }
        }
        return page.toString();
    }
}
------解决方案--------------------
可以使用 JDK 自带的 javax.swing.text.html.parser 这个包来解析 HTML。
------解决方案--------------------
或者这个更好
/**
 * Obtains, from a web page's source, the link URL that belongs to a given
 * piece of anchor text, for example the URL in
 * {@code <a href="http://www.sina.com" target="blank">Sina</a>}.
 */
import java.net.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.*;

public class UrlCodeRegex {

    /** Visible text of the anchor whose target is looked up. */
    private static final String LINK_TEXT = "社区";

    /**
     * Fetches the hard-coded page, looks up the link and prints it.
     * Network and URL errors are reported on standard output, as is the
     * case where the page contains no matching link.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String ur = "http://csbbs.soufun.com/2710156784~-2~683/5236858_5236858.htm";
        try {
            // Fetch the remote page content.
            String link = findLinkByText(readPage(ur), LINK_TEXT);
            if (link != null) {
                System.out.println("这个社区的网址是" + link);
            } else {
                // Previously Matcher.group was called without checking matches(),
                // which threw IllegalStateException when nothing matched.
                System.out.println("No http link with text \"" + LINK_TEXT + "\" found at " + ur);
            }
        } catch (MalformedURLException mfURLe) {
            System.out.println("MalformedURLException: " + mfURLe);
        } catch (IOException ioe) {
            System.out.println("IOException: " + ioe);
        }
    }

    /**
     * Returns the absolute {@code http://} URL of the first anchor in
     * {@code html} whose text is exactly {@code linkText}.
     *
     * <p>Only attributes of the same tag may sit between the {@code href} value
     * and the anchor text ({@code [^>]*}); an unbounded {@code .*} there would
     * let the match run across tags and return the href of an unrelated,
     * earlier anchor.
     *
     * @param html     page source to search
     * @param linkText exact anchor text; treated literally, not as a regex
     * @return the URL, or {@code null} when no anchor with that text has an
     *         {@code http://} href
     */
    static String findLinkByText(String html, String linkText) {
        Pattern anchor = Pattern.compile(
                "<a\\s+href=\"(http://(?:[\\w-]+\\.)+[\\w-]+(?:/[\\w ./?%&=-]*)?)\"[^>]*>"
                        + Pattern.quote(linkText) + "</a>");
        Matcher m = anchor.matcher(html);
        return m.find() ? m.group(1) : null;
    }

    /**
     * Reads the whole response body of {@code address} into a string.
     *
     * @param address URL to download
     * @return the page text, one {@code '\n'} after every line read
     * @throws IOException if the URL is malformed or the download fails
     */
    private static String readPage(String address) throws IOException {
        URLConnection con = new URL(address).openConnection();
        StringBuilder page = new StringBuilder();
        // NOTE(review): decodes with the platform default charset, as the original
        // did; confirm the encoding the target page is actually served in.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(con.getInputStream()))) {
            for (String line = in.readLine(); line != null; line = in.readLine()) {
                // Keep a separator so tokens split across lines are not fused together.
                page.append(line).append('\n');
            }
        }
        return page.toString();
    }
}
------解决方案--------------------
用 javax.xml.parsers.SAXParser
自己写个 handler 就好了! 像这样
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * SAX handler that collects the {@code href} attribute of every {@code a}
 * element it is told about. The {@code main} method lives in this class so
 * that the snippet compiles as a single file.
 */
class MyHandler extends DefaultHandler {

    /** The href values seen so far, in document order. */
    public ArrayList<String> list = new ArrayList<String>();

    /**
     * Parses the document at a URL with a namespace-aware SAX parser and prints
     * the collected href values.
     *
     * <p>The input must be well-formed XML; the SAX parser raises
     * {@link SAXException} on markup that is not.
     *
     * @param args unused
     * @throws Exception on parser configuration, I/O or parse errors
     */
    public static void main(String[] args) throws Exception {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        // The document comes from a remote server: do not let it pull in
        // external entities or an external DTD subset.
        factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        SAXParser parser = factory.newSAXParser();

        URL url = new URL("xxxx"); // replace xxxx with your URL
        URLConnection con = url.openConnection();
        MyHandler myhandler = new MyHandler();
        // Close the response stream even when parsing fails.
        try (InputStream in = con.getInputStream()) {
            parser.parse(in, myhandler);
        }
        System.out.println(myhandler.list);
    }

    /**
     * Records the {@code href} of an {@code a} element.
     *
     * @param uri        namespace URI of the element (not used for matching)
     * @param localName  local name; filled in by namespace-aware parsers
     * @param qName      qualified name; filled in by non-namespace-aware parsers
     * @param attributes attributes attached to the element
     * @throws SAXException never thrown here; kept for the overridden signature
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes)
            throws SAXException {
        // Match on the element name only; the namespace URI is never an element name.
        if ("a".equals(localName) || "a".equals(qName)) {
            String href = attributes.getValue("href");
            // Anchors without an href (e.g. named anchors) would otherwise add null.
            if (href != null) {
                list.add(href);
            }
        }
    }
}