日期:2014-05-20  浏览次数:20694 次

跪求解决 !!!利用Htmlparser抓取网页正文时出错,求教大神解决!
跪求解决 !!!
利用Htmlparser抓取网页正文时出错,求教大神解决!
/** The main text of a page usually sits inside a TABLE, DIV or ParagraphTag, so the DIV or TABLE that contains the most text is usually the main text. **/
//table有效性的记录
/**
 * Mutable holder for five integer counters recorded about a table.
 *
 * <p>The class only stores the values; it performs no validation and no
 * computation. Each counter starts at {@code 0} (the Java default for
 * {@code int} fields) until its setter is called.
 *
 * <p>NOTE(review): judging by the names, the counters presumably track the
 * number of TR rows, TD cells, links, text nodes and script nodes found in
 * a table — confirm against the code that populates this object.
 */
public class TableValid {

    private int trnum;
    private int tdnum;
    private int linknum;
    private int textnum;
    private int scriptnum;

    /** @return the stored {@code trnum} counter */
    public int getTrnum() {
        return trnum;
    }

    /** @param trnum new value for the {@code trnum} counter */
    public void setTrnum(int trnum) {
        this.trnum = trnum;
    }

    /** @return the stored {@code tdnum} counter */
    public int getTdnum() {
        return tdnum;
    }

    /** @param tdnum new value for the {@code tdnum} counter */
    public void setTdnum(int tdnum) {
        this.tdnum = tdnum;
    }

    /** @return the stored {@code linknum} counter */
    public int getLinknum() {
        return linknum;
    }

    /** @param linknum new value for the {@code linknum} counter */
    public void setLinknum(int linknum) {
        this.linknum = linknum;
    }

    /** @return the stored {@code textnum} counter */
    public int getTextnum() {
        return textnum;
    }

    /** @param textnum new value for the {@code textnum} counter */
    public void setTextnum(int textnum) {
        this.textnum = textnum;
    }

    /** @return the stored {@code scriptnum} counter */
    public int getScriptnum() {
        return scriptnum;
    }

    /** @param scriptnum new value for the {@code scriptnum} counter */
    public void setScriptnum(int scriptnum) {
        this.scriptnum = scriptnum;
    }
}
//table中的内容


import java.util.List;

/**
 * Mutable holder for data associated with one table: a list of links, a
 * text buffer, two row counters and a string marker.
 *
 * <p>All accessors are plain pass-throughs: setters store the given
 * reference or value as is (no copying, no null checks) and getters return
 * exactly what was stored. Object-typed properties are {@code null} and
 * numeric properties are {@code 0} until set.
 *
 * <p>NOTE(review): the exact meaning of {@code tableRow}, {@code totalRow}
 * and {@code sign} is not visible in this class — confirm with the code
 * that fills it.
 */
public class TableContext {

    private List<?> linkList;
    private StringBuffer textBuffer;
    private int tableRow;
    private int totalRow;
    private String sign;

    /** @return the stored link list, or {@code null} if none was set */
    public List<?> getLinkList() {
        return linkList;
    }

    /** @param linkList list to store; the reference is kept, not copied */
    public void setLinkList(List<?> linkList) {
        this.linkList = linkList;
    }

    /** @return the stored text buffer, or {@code null} if none was set */
    public StringBuffer getTextBuffer() {
        return textBuffer;
    }

    /** @param textBuffer buffer to store; the reference is kept, not copied */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the stored {@code tableRow} value */
    public int getTableRow() {
        return tableRow;
    }

    /** @param tableRow new {@code tableRow} value */
    public void setTableRow(int tableRow) {
        this.tableRow = tableRow;
    }

    /** @return the stored {@code totalRow} value */
    public int getTotalRow() {
        return totalRow;
    }

    /** @param totalRow new {@code totalRow} value */
    public void setTotalRow(int totalRow) {
        this.totalRow = totalRow;
    }

    /** @return the stored {@code sign} string, or {@code null} if none was set */
    public String getSign() {
        return sign;
    }

    /** @param sign new {@code sign} string */
    public void setSign(String sign) {
        this.sign = sign;
    }
}



//column有效性的记录
/**
 * Mutable holder pairing an integer {@code tdNum} with a boolean
 * {@code valid} flag.
 *
 * <p>Both fields are package-private, so code in the same package can also
 * read or write them directly. {@code tdNum} starts at {@code 0} and
 * {@code valid} at {@code false} (Java defaults).
 *
 * <p>NOTE(review): {@code tdNum} presumably identifies or counts TD cells
 * of a column — confirm with the caller.
 */
public class TableColumnValid {

    int tdNum;
    boolean valid;

    /** @return the stored {@code valid} flag */
    public boolean isValid() {
        return valid;
    }

    /** @param valid new value for the {@code valid} flag */
    public void setValid(boolean valid) {
        this.valid = valid;
    }

    /** @return the stored {@code tdNum} value */
    public int getTdNum() {
        return tdNum;
    }

    /** @param tdNum new {@code tdNum} value */
    public void setTdNum(int tdNum) {
        this.tdNum = tdNum;
    }
}


//页面内容
import org.htmlparser.Node;


/**
 * Mutable holder for data associated with a page: an HTMLParser
 * {@link Node}, an integer {@code number} and a text buffer.
 *
 * <p>Accessors are plain pass-throughs: setters keep the given reference or
 * value without copying or validation, and getters return it unchanged.
 * {@code node} and {@code textBuffer} are {@code null} and {@code number}
 * is {@code 0} until set.
 *
 * <p>NOTE(review): what {@code number} measures is not visible in this
 * class — confirm with the code that fills it.
 */
public class PageContext {

    private StringBuffer textBuffer;
    private int number;
    private Node node;

    /** @return the stored text buffer, or {@code null} if none was set */
    public StringBuffer getTextBuffer() {
        return textBuffer;
    }

    /** @param textBuffer buffer to store; the reference is kept, not copied */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the stored {@code number} value */
    public int getNumber() {
        return number;
    }

    /** @param number new {@code number} value */
    public void setNumber(int number) {
        this.number = number;
    }

    /** @return the stored node, or {@code null} if none was set */
    public Node getNode() {
        return node;
    }

    /** @param node node to store; the reference is kept as is */
    public void setNode(Node node) {
        this.node = node;
    }
}


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang.StringUtils;