日期:2014-05-17 浏览次数:20835 次
package net.nutch.fetcher;   
...   
import org.cyberneko.html.parsers.*;   
import org.xml.sax.*;   
import org.w3c.dom.*;   
import org.w3c.dom.html.*;   
import org.apache.html.dom.*;   
/* A simple fetcher. */  
public class Fetcher {   
....   
   // HTML fragment parser (presumably the NekoHTML one from org.cyberneko.html.parsers);
   // handleFetch uses this single instance to parse the content of each response.
   private DOMFragmentParser parser = new DOMFragmentParser();   
....   
   /**
    * Handles one HTTP response: parses its content as HTML, extracts the page
    * text, the title and the outgoing links, and hands everything to
    * {@code outputPage}.
    *
    * <p>A response without a Content-Type header is parsed as HTML. A response
    * whose Content-Type does not start with "text/html" is rejected.
    *
    * @param url      address of the page; passed to {@code getOutlinks} as the
    *                 base URL and used in the log message
    * @param fle      fetch-list entry stored in the resulting FetcherOutput
    * @param response HTTP response whose header and content are processed
    * @throws IOException  if a Content-Type header is present and does not
    *                      start with "text/html" (compared ignoring case)
    * @throws SAXException declared for the HTML parsing step
    */
   private void handleFetch(URL url, FetchListEntry fle, Http.Response response)
     throws IOException, SAXException {

     // Only HTML documents are let through. Media type names are
     // case-insensitive, so "Text/HTML; charset=..." must be accepted as well;
     // regionMatches(true, ...) compares without depending on the default locale.
     final String htmlType = "text/html";
     String contentType = response.getHeader("Content-Type");
     if (contentType != null
         && !contentType.regionMatches(true, 0, htmlType, 0, htmlType.length()))
       throw new IOException("Unknown content-type: " + contentType);
     // Create the document fragment that receives the parsed nodes.
     DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment();
     // Parse the HTML content into the fragment.
     parser.parse(new InputSource(new ByteArrayInputStream(response.getContent())),node);
     // Collect the text of the whole page.
     StringBuffer sb = new StringBuffer();
     getText(sb, node);
     String text = sb.toString();
     // Collect the title, reusing the same buffer.
     sb.setLength(0);
     getTitle(sb, node);
     String title = sb.toString().trim();
     // Collect all outgoing links of the page.
     ArrayList l = new ArrayList();
     getOutlinks(url, l, node);

     // Log the result and store the page.
     Outlink[] outlinks = (Outlink[])l.toArray(new Outlink[l.size()]);
     LOG.fine("found " + outlinks.length + " outlinks in " + url);
     outputPage(new FetcherOutput(fle, MD5Hash.digest(response.getContent()),
                                  true, title, outlinks),
                new FetcherContent(response.getContent()),
                new FetcherText(text));
   }
  /**
   * Appends to {@code sb} the value of every text node in the subtree rooted
   * at {@code node}, visiting the tree depth-first in document order.
   *
   * @param sb   buffer that receives the text; existing content is kept
   * @param node root of the subtree to read
   */
  private static void getText(StringBuffer sb, Node node) {
    if (Node.TEXT_NODE == node.getNodeType()) {
      // The value of a text node is the character data between the tags.
      sb.append(node.getNodeValue());
    }
    // Walk the children from first to last and descend into each one.
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
      getText(sb, child);
    }
  }
  /**
   * Searches the subtree rooted at {@code node}, depth-first, for the first
   * element named "title" (case ignored) and appends its text to {@code sb}.
   *
   * @param sb   buffer that receives the title text
   * @param node root of the subtree to search
   * @return {@code true} if a title element was found, {@code false} otherwise
   */
  private static boolean getTitle(StringBuffer sb, Node node) {
    boolean isTitleElement = node.getNodeType() == Node.ELEMENT_NODE
        && "title".equalsIgnoreCase(node.getNodeName());
    if (isTitleElement) {
      getText(sb, node);
      return true;
    }
    // Not a title: try each child in order and stop at the first hit.
    for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
      if (getTitle(sb, child)) {
        return true;
      }
    }
    return false;
  }
  private static void getOutlinks(URL base, ArrayList outlinks, Node node) {   
   if (node.getNodeType() == Node.ELEMENT_NODE) {   
     if ("a".equalsIgnoreCase(node.getNodeName())) {   
       StringBuffer linkText = new StringBuffer();   
       getText(linkText, node);   
       NamedNodeMap attrs = node.getAttributes();   
       String target= null;   
       for (int i= 0; i < attrs.getLength(); i++ ) {   
         if ("href".equalsIgnoreCase(attrs.item(i).getNodeName