跪求解决!!!利用 HtmlParser 抓取网页正文时出错,求教大神解决!
跪求解决 !!!
利用Htmlparser抓取网页正文时出错,求教大神解决!
/** 正文通常位于 TABLE、DIV 或 ParagraphTag 标签内,因此包含文字最多的 DIV 或 TABLE 通常就是正文。 */
//table有效性的记录
/**
 * Mutable holder for a set of integer counters describing one table.
 *
 * <p>The class only stores the values; it performs no validation or
 * computation. All counters start at {@code 0} (the Java default for
 * {@code int} fields).
 *
 * <p>NOTE(review): judging by the names, the counters are presumably the
 * numbers of {@code <tr>}, {@code <td>}, link, text and script nodes found in
 * a table -- confirm against the code that populates this object.
 */
public class TableValid {

    private int trnum;
    private int tdnum;
    private int linknum;
    private int textnum;
    private int scriptnum;

    /** @return the stored {@code trnum} counter */
    public int getTrnum() {
        return this.trnum;
    }

    /** @param trnum new value for the {@code trnum} counter */
    public void setTrnum(final int trnum) {
        this.trnum = trnum;
    }

    /** @return the stored {@code tdnum} counter */
    public int getTdnum() {
        return this.tdnum;
    }

    /** @param tdnum new value for the {@code tdnum} counter */
    public void setTdnum(final int tdnum) {
        this.tdnum = tdnum;
    }

    /** @return the stored {@code linknum} counter */
    public int getLinknum() {
        return this.linknum;
    }

    /** @param linknum new value for the {@code linknum} counter */
    public void setLinknum(final int linknum) {
        this.linknum = linknum;
    }

    /** @return the stored {@code textnum} counter */
    public int getTextnum() {
        return this.textnum;
    }

    /** @param textnum new value for the {@code textnum} counter */
    public void setTextnum(final int textnum) {
        this.textnum = textnum;
    }

    /** @return the stored {@code scriptnum} counter */
    public int getScriptnum() {
        return this.scriptnum;
    }

    /** @param scriptnum new value for the {@code scriptnum} counter */
    public void setScriptnum(final int scriptnum) {
        this.scriptnum = scriptnum;
    }
}
//table中的内容
import java.util.List;
/**
 * Mutable holder for the content extracted from one table: a list of links,
 * a text buffer, two row counts and a sign string.
 *
 * <p>Every accessor simply reads or writes the matching field. References are
 * stored and returned as-is (no defensive copies), and reference-typed fields
 * are {@code null} until a setter is called.
 *
 * <p>NOTE(review): the element type of the link list and the meaning of
 * {@code sign} are not visible in this class -- check the callers.
 */
public class TableContext {

    private List<?> linkList;
    private StringBuffer textBuffer;
    private int tableRow;
    private int totalRow;
    private String sign;

    /** @return the stored link list, or {@code null} if none was set */
    public List<?> getLinkList() {
        return this.linkList;
    }

    /** @param linkList the list to store; kept by reference */
    public void setLinkList(final List<?> linkList) {
        this.linkList = linkList;
    }

    /** @return the stored text buffer, or {@code null} if none was set */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /** @param textBuffer the buffer to store; kept by reference */
    public void setTextBuffer(final StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the stored {@code tableRow} value */
    public int getTableRow() {
        return this.tableRow;
    }

    /** @param tableRow new {@code tableRow} value */
    public void setTableRow(final int tableRow) {
        this.tableRow = tableRow;
    }

    /** @return the stored {@code totalRow} value */
    public int getTotalRow() {
        return this.totalRow;
    }

    /** @param totalRow new {@code totalRow} value */
    public void setTotalRow(final int totalRow) {
        this.totalRow = totalRow;
    }

    /** @return the stored sign string, or {@code null} if none was set */
    public String getSign() {
        return this.sign;
    }

    /** @param sign new sign string */
    public void setSign(final String sign) {
        this.sign = sign;
    }
}
//column有效性的记录
/**
 * Mutable holder pairing an integer {@code tdNum} with a boolean
 * {@code valid} flag.
 *
 * <p>Both fields are package-private, so code in the same package can read
 * and write them directly as well as through the accessors below.
 *
 * <p>NOTE(review): {@code tdNum} presumably identifies or counts a table
 * column ({@code <td>}) -- confirm against the code that fills it in.
 */
public class TableColumnValid {

    // Left package-private on purpose: same-package code may touch these directly.
    int tdNum;
    boolean valid;

    /** @return the current {@code valid} flag ({@code false} until set) */
    public boolean isValid() {
        return this.valid;
    }

    /** @param valid new value of the flag */
    public void setValid(final boolean valid) {
        this.valid = valid;
    }

    /** @return the stored {@code tdNum} value ({@code 0} until set) */
    public int getTdNum() {
        return this.tdNum;
    }

    /** @param tdNum new {@code tdNum} value */
    public void setTdNum(final int tdNum) {
        this.tdNum = tdNum;
    }
}
//页面内容
import org.htmlparser.Node;
/**
 * Mutable holder for page-level content: an HTMLParser {@link Node}, an
 * integer and a text buffer.
 *
 * <p>Every accessor simply reads or writes the matching field. References are
 * stored and returned as-is, and are {@code null} until a setter is called.
 *
 * <p>NOTE(review): what {@code number} measures (for example, the amount of
 * text under the node) is not visible in this class -- check the callers.
 */
public class PageContext {

    private StringBuffer textBuffer;
    private int number;
    private Node node;

    /** @return the stored text buffer, or {@code null} if none was set */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /** @param textBuffer the buffer to store; kept by reference */
    public void setTextBuffer(final StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /** @return the stored {@code number} value ({@code 0} until set) */
    public int getNumber() {
        return this.number;
    }

    /** @param number new {@code number} value */
    public void setNumber(final int number) {
        this.number = number;
    }

    /** @return the stored node, or {@code null} if none was set */
    public Node getNode() {
        return this.node;
    }

    /** @param node the node to store; kept by reference */
    public void setNode(final Node node) {
        this.node = node;
    }
}
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;