日期:2014-05-17 浏览次数:20741 次
package com.cs.parser.util;

import org.htmlparser.Node;

/**
 * Mutable holder that groups an htmlparser {@link Node} with a text buffer and an integer value.
 *
 * <p>None of the properties is initialised by this class, so until the matching setter is
 * called {@link #getNode()} and {@link #getTextBuffer()} return {@code null} and
 * {@link #getNumber()} returns {@code 0}.
 */
public class PageContent {

    /** Node associated with this content; {@code null} until set. */
    private Node node;

    /** Integer value attached to this content (presumably a count or score; confirm with callers). */
    private int number;

    /** Text buffer attached to this content; {@code null} until set. */
    private StringBuffer textBuffer;

    /**
     * Returns the stored node.
     *
     * @return the node last passed to {@link #setNode(Node)}, or {@code null} if never set
     */
    public Node getNode() {
        return this.node;
    }

    /**
     * Stores the given node reference as-is.
     *
     * @param node the node to keep
     */
    public void setNode(Node node) {
        this.node = node;
    }

    /**
     * Returns the stored integer value.
     *
     * @return the value last passed to {@link #setNumber(int)}, or {@code 0} if never set
     */
    public int getNumber() {
        return this.number;
    }

    /**
     * Stores the given integer value.
     *
     * @param number the value to keep
     */
    public void setNumber(int number) {
        this.number = number;
    }

    /**
     * Returns the stored text buffer (the same instance, not a copy).
     *
     * @return the buffer last passed to {@link #setTextBuffer(StringBuffer)}, or {@code null}
     */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /**
     * Stores the given buffer reference as-is (no defensive copy is made).
     *
     * @param textBuffer the buffer to keep
     */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }
}
package com.cs.parser.util; public class TableValid { private int trnum; private int tdnum; private int linknum; private int textnum; private int scriptnum; public int getScriptnum() { return scriptnum; } public void setScriptnum(int scriptnum) { this.scriptnum = scriptnum; } public int getLinknum() { return linknum; } public void setLinknum(int linknum) { this.linknum = linknum; } public int getTdnum() { return tdnum; } public void setTdnum(int tdnum) { this.tdnum = tdnum; } public int getTextnum() { return textnum; } public void setTextnum(int textnum) { this.textnum = textnum; } public int getTrnum() { return trnum; } public void setTrnum(int trnum) { this.trnum = trnum; } }
package com.cs.parser.util; public class TableColumnValid { int tdNum; boolean valid; public int getTdNum() { return tdNum; } public void setTdNum(int tdNum) { this.tdNum = tdNum; } public boolean isValid() { return valid; } public void setValid(boolean valid) { this.valid = valid; } }
package com.cs; public interface Parsable { public String getTitle() ; public String getContent() ; public String getSummary() ; }
package com.cs; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.nodes.TagNode; import org.htmlparser.nodes.TextNode; import org.htmlparser.tags.Div; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.LinkTag; import org.htmlparser.tags.ParagraphTag; import org.htmlparser.tags.ScriptTag; import org.htmlparser.tags.SelectTag; import org.htmlparser.tags.Span; import org.htmlparser.tags.StyleTag; import org.htmlparser.tags.TableColumn; import org.htmlparser.tags.TableHeader; import org.htmlparser.tags.TableRow; import org.htmlparser.tags.TableTag; import org.htmlparser.tags.TitleTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import com.cs.parser.util.PageContent; import com.cs.parser.util.TableColumnValid; import com.cs.parser.util.TableValid; public class EasyHtmlParser implements Parsable { protected static final String lineSign = System.getProperty( "line.separator"); protected static final int lineSign_size = lineSign.length(); private File file ; private String content ; private String summary ; private String title ; public static void main(String[] args) { EasyHtmlParser eParser = new EasyHtmlParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm")) ; System.out.println("html content : "+ePar