// Date: 2014-05-17  Views: 20635
import java.io.BufferedReader; import java.io.FileReader; import java.io.InputStreamReader; import java.net.URL; import org.cyberneko.html.parsers.DOMParser; import org.htmlparser.tags.Span; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; public class JavaTest { public static String TextExtractor(Node root){ //若是文本节点的话,直接返回 if (root.getNodeType() == Node.TEXT_NODE) { return ""; } if(root.getNodeType() == Node.ELEMENT_NODE) { Element elmt = (Element) root; //抛弃脚本 if (elmt.getTagName().equals("STYLE") || elmt.getTagName().equals("SCRIPT")) return ""; NodeList children = elmt.getChildNodes(); StringBuilder text = new StringBuilder(); if (elmt.getTagName().equals("SPAN")) { if (elmt.hasAttribute("id")) { // System.out.println(elmt.getAttribute("id")); if (elmt.getAttribute("id").equals("countOfPrd")) { return children.item(0).getNextSibling().getNextSibling().getNextSibling().getFirstChild().getNodeValue(); } } } for (int i = 0; i < children.getLength(); i++) { text.append(TextExtractor(children.item(i))); } return text.toString(); } //对其它类型的节点,返回空值 return ""; } public static void main(String[] args) throws Exception{ //生成html parser DOMParser parser = new DOMParser(); //设置网页的默认编码 parser.setProperty( "http://cyberneko.org/html/properties/default-encoding", "utf-8"); //input file URL a = new URL("http://www.suning.cn/webapp/wcs/stores/servlet/prd_10052_10051_-7_9173_196583_.html"); BufferedReader in = new BufferedReader(new InputStreamReader(a.openStream())); //BufferedReader in = new BufferedReader(new FileReader("input.htm")); parser.parse(new InputSource(in)); Document doc = parser.getDocument(); //获得body节点,以此为根,计算其文本内容 Node body = doc.getElementsByTagName("BODY").item(0); System.out.println(TextExtractor(body)); } } ?//该段代码是用来解析抓取各大网站的价格,来智能化定价。实现价格战的目的赢得用户,不懂或更多探究QQ526151410