日期:2014-05-17  浏览次数:20741 次

Lucene 如何抽取 HTML 网页
要解析 HTML 页面,就要先对 HTML 中的标签做处理。

先准备几个工具类:
package com.cs.parser.util;


import org.htmlparser.Node;

/**
 * Mutable holder that bundles a text buffer, an integer number and an
 * htmlparser {@link Node}.
 *
 * <p>No constructor is declared, so a new instance starts with the Java
 * defaults ({@code null} for the references, {@code 0} for the number).
 * The values are changed only through the setters below; nothing is
 * validated or copied.
 *
 * <p>NOTE(review): the exact meaning of {@code number} is not visible in
 * this class — confirm against the code that populates it.
 */
public class PageContent {

    private Node node;
    private int number;
    private StringBuffer textBuffer;

    /**
     * @return the text buffer currently held, or {@code null} if none was set
     */
    public StringBuffer getTextBuffer() {
        return this.textBuffer;
    }

    /**
     * Stores the given buffer by reference (no defensive copy).
     *
     * @param textBuffer the buffer to hold; may be {@code null}
     */
    public void setTextBuffer(StringBuffer textBuffer) {
        this.textBuffer = textBuffer;
    }

    /**
     * @return the number currently held; {@code 0} until a value is set
     */
    public int getNumber() {
        return this.number;
    }

    /**
     * @param number the new number to hold
     */
    public void setNumber(int number) {
        this.number = number;
    }

    /**
     * @return the node currently held, or {@code null} if none was set
     */
    public Node getNode() {
        return this.node;
    }

    /**
     * @param node the node to hold; may be {@code null}
     */
    public void setNode(Node node) {
        this.node = node;
    }
}


package com.cs.parser.util;



/**
 * Mutable bean holding five independent integer counters.
 *
 * <p>Every counter starts at {@code 0} (no constructor is declared) and is
 * read and written only through its getter/setter pair; no validation is
 * performed.
 *
 * <p>NOTE(review): the field names suggest per-table tallies of rows,
 * cells, links, text nodes and scripts — confirm against the code that
 * fills this object in.
 */
public class TableValid {

    private int trnum;
    private int tdnum;
    private int linknum;
    private int textnum;
    private int scriptnum;

    // --- trnum ---------------------------------------------------------

    /** @return the current {@code trnum} value */
    public int getTrnum() {
        return this.trnum;
    }

    /** @param trnum the new {@code trnum} value */
    public void setTrnum(int trnum) {
        this.trnum = trnum;
    }

    // --- tdnum ---------------------------------------------------------

    /** @return the current {@code tdnum} value */
    public int getTdnum() {
        return this.tdnum;
    }

    /** @param tdnum the new {@code tdnum} value */
    public void setTdnum(int tdnum) {
        this.tdnum = tdnum;
    }

    // --- linknum -------------------------------------------------------

    /** @return the current {@code linknum} value */
    public int getLinknum() {
        return this.linknum;
    }

    /** @param linknum the new {@code linknum} value */
    public void setLinknum(int linknum) {
        this.linknum = linknum;
    }

    // --- textnum -------------------------------------------------------

    /** @return the current {@code textnum} value */
    public int getTextnum() {
        return this.textnum;
    }

    /** @param textnum the new {@code textnum} value */
    public void setTextnum(int textnum) {
        this.textnum = textnum;
    }

    // --- scriptnum -----------------------------------------------------

    /** @return the current {@code scriptnum} value */
    public int getScriptnum() {
        return this.scriptnum;
    }

    /** @param scriptnum the new {@code scriptnum} value */
    public void setScriptnum(int scriptnum) {
        this.scriptnum = scriptnum;
    }
}


package com.cs.parser.util;

/**
 * Mutable bean pairing an integer {@code tdNum} with a boolean
 * {@code valid} flag.
 *
 * <p>Both fields are package-private and start at their Java defaults
 * ({@code 0} and {@code false}); the accessors below read and write them
 * without any validation.
 *
 * <p>NOTE(review): presumably {@code tdNum} identifies or counts table
 * cells and {@code valid} marks whether that column is usable — confirm
 * against the caller.
 */
public class TableColumnValid {

    // Package-private on purpose of compatibility: visibility is unchanged
    // from the original declaration.
    boolean valid;
    int tdNum;

    /** @return the current {@code valid} flag */
    public boolean isValid() {
        return this.valid;
    }

    /** @param valid the new value of the {@code valid} flag */
    public void setValid(boolean valid) {
        this.valid = valid;
    }

    /** @return the current {@code tdNum} value */
    public int getTdNum() {
        return this.tdNum;
    }

    /** @param tdNum the new {@code tdNum} value */
    public void setTdNum(int tdNum) {
        this.tdNum = tdNum;
    }
}


接下来看看如何解析 HTML 页面。
先在项目中加入 htmlparser.jar 包:
package com.cs;

public interface Parsable {
	
	public String getTitle() ;
	public String getContent()  ;
	public String getSummary()  ;
}

package com.cs;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Div;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.ParagraphTag;
import org.htmlparser.tags.ScriptTag;
import org.htmlparser.tags.SelectTag;
import org.htmlparser.tags.Span;
import org.htmlparser.tags.StyleTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableHeader;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.cs.parser.util.PageContent;
import com.cs.parser.util.TableColumnValid;
import com.cs.parser.util.TableValid;
 

public class EasyHtmlParser implements Parsable {
	
	 protected static final String lineSign = System.getProperty(
     "line.separator");
	 protected static final int lineSign_size = lineSign.length();

	
	private File file ;
	
	private String content ;
	private String summary ;
	private String title ;
	
	
	public static void main(String[] args) {
		EasyHtmlParser eParser = new EasyHtmlParser(new File("E:\\EclipseStudyWorkspace\\LuceneParse\\fileSource\\www.htm")) ;
		System.out.println("html content : "+ePar