// 日期:2014-05-17 浏览次数:20665 次
//import java.io.File; import java.io.IOException; import java.net.URL; import org.htmlcleaner.CleanerProperties; import org.htmlcleaner.HtmlCleaner; import org.htmlcleaner.PrettyXmlSerializer; import org.htmlcleaner.TagNode; import org.htmlcleaner.XPatherException; //import com.sun.xml.internal.txw2.output.XmlSerializer; /** * 数据抓取 */ public class HtmlClean { @SuppressWarnings("deprecation") /** * 数据抓取 */ public void cleanHtml(String htmlurl, String xmlurl, String xpath) throws XPatherException { try { //将目标网址内容抓取下来存到本地的XML文件中(格式化) //long start = System.currentTimeMillis(); HtmlCleaner cleaner = new HtmlCleaner(); CleanerProperties props = cleaner.getProperties(); props.setUseCdataForScriptAndStyle(true); props.setRecognizeUnicodeChars(true); props.setUseEmptyElementTags(true); props.setAdvancedXmlEscape(true); props.setTranslateSpecialEntities(true); props.setBooleanAttributeValues("empty"); TagNode node = cleaner.clean(new URL(htmlurl)); // System.out.println(in); //System.out.println(((TagNode) ns[0]).getText()); //System.out.println("vreme:" + (System.currentTimeMillis() - start)); new PrettyXmlSerializer(props).writeXmlToFile(node, xmlurl);//格式化保存 String result = new PrettyXmlSerializer(props).getXmlAsString(node); //System.out.println("vreme:" + (System.currentTimeMillis() - start)); System.out.println("*********************************************************"); // // //TagNode Xmlnode = cleaner.clean(new URL(xmlurl));//从已经格式化的XML文件中取出所要的数据 TagNode Xmlnode = cleaner.clean(new String(result));//从已格式化的String中取出所要的数据 Object[] ns = Xmlnode.getElementsByName("title", true); // 标题 if (ns.length > 0) { System.out.println("title=" + ((TagNode) ns[0]).getText()); } ns = Xmlnode.evaluateXPath(xpath); // 选取class为指定dixian1的所有td标签 // for (int i = 0; i < ns.length; i++) { // String in = cleaner.getInnerHtml((TagNode) ns[i]); // System.out.println("<span>" + in + "</span>"); // } System.out.println("*********************************************************"); String in = 
cleaner.getInnerHtml((TagNode) ns[0]); for(int i=0 ;i<ns.length ;i++){ in = cleaner.getInnerHtml((TagNode) ns[i]); System.out.println(in); if((i+1)%8==0){ System.out.println("*********************************************************"); } } } catch (IOException e) { e.printStackTrace(); } } public static void main(String[] args) throws XPatherException { HtmlClean cleaner = new HtmlClean(); cleaner.cleanHtml("http://app.sipo.gov.cn:8080/sipo2008/searchfee/searchfee_action.jsp?sqh=01351345.1", "E://text/test.xml","//td[@class='dixian1']"); } }?