日期:2014-05-17 浏览次数:20648 次
package net.nutch.fetcher; ... import org.cyberneko.html.parsers.*; import org.xml.sax.*; import org.w3c.dom.*; import org.w3c.dom.html.*; import org.apache.html.dom.*; /* A simple fetcher. */ public class Fetcher { .... private DOMFragmentParser parser = new DOMFragmentParser(); .... private void handleFetch(URL url, FetchListEntry fle, Http.Response response) throws IOException, SAXException { //判断HTTP应答包的类型,只放过html文件 String contentType = response.getHeader("Content-Type"); if (contentType != null && !contentType.startsWith("text/html")) throw new IOException("Unknown content-type: " + contentType); //创建文件片段对象 DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); //解析HTML内容 parser.parse(new InputSource(new ByteArrayInputStream(response.getContent())),node); //取得全部文本内容 StringBuffer sb = new StringBuffer(); getText(sb, node); String text = sb.toString(); //取得标题信息 sb.setLength(0); getTitle(sb, node); String title = sb.toString().trim(); //取得该页所有的出链 ArrayList l = new ArrayList(); getOutlinks(url, l, node); //显示结果,存储信息 Outlink[] outlinks = (Outlink[])l.toArray(new Outlink[l.size()]); LOG.fine("found " + outlinks.length + " outlinks in " + url); outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()), true, title, outlinks), new FetcherContent(response.getContent()), new FetcherText(text)); } private static void getText(StringBuffer sb, Node node) { if (node.getNodeType() == Node.TEXT_NODE) { sb.append(node.getNodeValue());//取得结点值,即开始与结束标签之间的信息 } NodeList children = node.getChildNodes(); if ( children != null ) { int len = children.getLength(); for ( int i = 0; i < len; i++ ) { getText(sb, children.item(i));//递归遍历DOM树 } } } private static boolean getTitle(StringBuffer sb, Node node) { if (node.getNodeType() == Node.ELEMENT_NODE) { if ("title".equalsIgnoreCase(node.getNodeName())) { getText(sb, node); return true; } } NodeList children = node.getChildNodes(); if (children != null) { int len = children.getLength(); for (int i = 0; i < len; i++) { if (getTitle(sb, children.item(i))) { return true; } } } return false; } private static void getOutlinks(URL base, ArrayList outlinks, Node node) { if (node.getNodeType() == Node.ELEMENT_NODE) { if ("a".equalsIgnoreCase(node.getNodeName())) { StringBuffer linkText = new StringBuffer(); getText(linkText, node); NamedNodeMap attrs = node.getAttributes(); String target= null; for (int i= 0; i < attrs.getLength(); i++ ) { if ("href".equalsIgnoreCase(attrs.item(i).getNodeName