日期:2014-05-17  浏览次数:20771 次

HtmlCleaner结合XPath的用法

文章分类:Java编程


        // Example 1: fetch an article page, print its title and its body paragraphs.
        HtmlCleaner cleaner = new HtmlCleaner();

        // clean(URL) downloads the page and parses it into a TagNode tree.
        TagNode node = cleaner.clean(new URL("http://finance.sina.com.cn/money/nmetal/20091209/10157077895.shtml"));

        // Lookup by tag name: the page <title> (second argument true = search recursively).
        Object[] ns = node.getElementsByName("title", true);

        if (ns.length > 0) {
            System.out.println("title=" + ((TagNode) ns[0]).getText());
        }

        // Lookup by XPath: every <p> that is a direct child of a <div> whose class
        // attribute is exactly "blkContainerSblkCon".
        // The original author noted the absolute path as /html/body/div[2]/div[4]/div/div/div/div[2]/p
        ns = node.evaluateXPath("//div[@class=\"blkContainerSblkCon\"]/p");
        for (Object paragraph : ns) {
            String in = cleaner.getInnerHtml((TagNode) paragraph);
            System.out.println("<p>" + in + "</p>");
        }

        // The XPath may match nothing (for instance if the page layout changed), so
        // only touch the first match when there is one; the unguarded ns[0] would
        // otherwise throw ArrayIndexOutOfBoundsException.
        if (ns.length > 0) {
            TagNode first = (TagNode) ns[0];

            // Inner HTML keeps the nested markup ...
            System.out.println(cleaner.getInnerHtml(first));

            // ... while getText() yields only the text content.
            System.out.println(first.getText());
        }




    示例二:抓取列表页,并输出其中各链接的绝对地址
            // Example 2: fetch a listing page and print the absolute URL of each link in it.
            HtmlCleaner cleaner = new HtmlCleaner();
            String url = "http://finance.sina.com.cn/nmetal/hjfx.html";
            // Kept as a URL object because it is also the base for resolving relative hrefs below.
            URL _url = new URL(url);
            TagNode node = cleaner.clean(_url);

            // Lookup by tag name: the page <title> (second argument true = search recursively).
            Object[] ns = node.getElementsByName("title", true);

            if (ns.length > 0) {
                System.out.println("title=" + ((TagNode) ns[0]).getText());
            }

            // Lookup by XPath: the <a> elements inside the cells of the second <table>
            // under any element whose class attribute is "Frame-Row3-01-C".
            ns = node.evaluateXPath("//*[@class='Frame-Row3-01-C']/table[2]/tbody/tr/td/a");
            for (Object link : ns) {
                TagNode n = (TagNode) link;

                // Alternative: print the link's inner HTML instead of its target.
                // String in = cleaner.getInnerHtml(n);
                // System.out.println(in);

                // Alternative: print the raw href exactly as written in the page.
                // System.out.println(n.getAttributeByName("href"));

                // An <a> without an href attribute yields null, and new URL(base, null)
                // throws MalformedURLException; skip such anchors instead of aborting.
                String href = n.getAttributeByName("href");
                if (href == null) {
                    continue;
                }

                // Resolve the (possibly relative) href against the page URL.
                System.out.println(new URL(_url, href).toString());
            }

            // Further lookups, left disabled as reference:
            //
            // String in = cleaner.getInnerHtml((TagNode) ns[0]);
            // System.out.println(in);
            //
            // System.out.println(((TagNode) ns[0]).getText());
            //
            // System.out.println("ul/li:");
            // // Lookup by XPath
            // ns = node.evaluateXPath("//div[@class='d_1']//li");
            // for (Object on : ns) {
            //     TagNode n = (TagNode) on;
            //     System.out.println("\ttext=" + n.getText());
            // }
            // System.out.println("a:");
            // // Lookup by attribute value
            // ns = node.getElementsByAttValue("name", "my_href", true, true);
            // for (Object on : ns) {
            //     TagNode n = (TagNode) on;
            //     System.out.println("\thref=" + n.getAttributeByName("href") + ", text=" + n.getText());
            // }
        
    




    XPath文档:http://www.w3school.com.cn/xpath/xpath_syntax.asp