日期:2014-05-17 浏览次数:20829 次
package com.ansj.sun.pojo;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regex-based extractor that locates spans delimited by a "begin" and an "end"
 * regular expression inside a piece of text.
 *
 * <p>The effective pattern is the plain concatenation
 * {@code beginRegex + textRegex + endRegex}, compiled without any flags. The
 * parser wraps a single {@link Matcher}, so it is stateful: {@link #hasNext()}
 * and {@link #getText()} each advance to the next match, and
 * {@link #reset(String)} restarts matching on new content while keeping the
 * configured pattern and filters.
 */
public class AnsjPaser {
    /** Default pattern for the text between the delimiters: a reluctant "anything". */
    public static final String TEXTTEGEX = ".*?";
    /** Reluctant run of non-word characters, handy when composing delimiters. */
    public static final String W = "\\W*?";
    /** Empty string; used as the replacement text and as the initial content. */
    public static final String N = "";

    private final String beginRegex;
    private final String endRegex;
    private final Matcher matcher;
    /** Filters applied by {@link #getText()}, compiled once when they are added. */
    private final List<Pattern> filterPatterns = new ArrayList<Pattern>();

    /**
     * Creates a parser and binds it to {@code content} right away.
     *
     * @param beginRegex regex marking the start of a span
     * @param endRegex   regex marking the end of a span
     * @param content    text to parse; may be {@code null}, in which case the
     *                   parser starts on empty content and the real text must be
     *                   supplied through {@link #reset(String)}
     * @param textRegex  regex for the part between begin and end
     *                   (the two-argument constructor uses {@link #TEXTTEGEX})
     */
    public AnsjPaser(String beginRegex, String endRegex, String content,
            String textRegex) {
        this.beginRegex = beginRegex;
        this.endRegex = endRegex;
        StringBuilder sb = new StringBuilder();
        sb.append(beginRegex);
        sb.append(textRegex);
        sb.append(endRegex);
        // Pattern.matcher(null) throws NullPointerException, yet a missing
        // content is a documented use case, so fall back to empty content.
        this.matcher = Pattern.compile(sb.toString())
                .matcher(content == null ? N : content);
    }

    /**
     * Creates a parser with no content yet; call {@link #reset(String)} before use.
     *
     * @param beginRegex regex marking the start of a span
     * @param endRegex   regex marking the end of a span
     * @param textRegex  regex for the part between begin and end
     */
    public AnsjPaser(String beginRegex, String endRegex, String textRegex) {
        this(beginRegex, endRegex, N, textRegex);
    }

    /**
     * Creates a parser with no content yet, using {@link #TEXTTEGEX} between the
     * delimiters; call {@link #reset(String)} before use.
     *
     * @param beginRegex regex marking the start of a span
     * @param endRegex   regex marking the end of a span
     */
    public AnsjPaser(String beginRegex, String endRegex) {
        this(beginRegex, endRegex, N, TEXTTEGEX);
    }

    /**
     * Advances to the next match and returns its inner text.
     *
     * <p>The matched span is trimmed, the first occurrence of the begin regex and
     * every occurrence of the end regex are removed from it, and then each
     * registered filter regex is removed in the order the filters were added.
     *
     * @return the cleaned text of the next match, or {@code null} when there is
     *         no further match
     */
    public String getText() {
        if (!matcher.find()) {
            return null;
        }
        String str = matcher.group().trim().replaceFirst(beginRegex, N)
                .replaceAll(endRegex, N);
        for (Pattern filter : filterPatterns) {
            str = filter.matcher(str).replaceAll(N);
        }
        return str;
    }

    /**
     * Returns the full text of the current match, delimiters included.
     *
     * @return the span found by the most recent successful {@link #hasNext()} or
     *         {@link #getText()} call
     * @throws IllegalStateException if no match has been found yet or the last
     *                               match attempt failed
     */
    public String getNext() {
        return matcher.group();
    }

    /**
     * Tries to find the next match. Unlike {@code Iterator.hasNext()}, every call
     * moves the parser forward, so call it exactly once per {@link #getNext()}.
     *
     * @return {@code true} if another match was found
     */
    public boolean hasNext() {
        return matcher.find();
    }

    /**
     * Restarts the parser on new content. The compiled pattern and the filter
     * regexes are kept.
     *
     * @param content text to parse; {@code null} is treated as empty content
     * @return this parser, for chaining
     */
    public AnsjPaser reset(String content) {
        // Matcher.reset(null) fails with NullPointerException; use empty content instead.
        this.matcher.reset(content == null ? N : content);
        return this;
    }

    /**
     * Registers a regex whose matches are removed from the text returned by
     * {@link #getText()}.
     *
     * @param filterRegex regex to strip from extracted text
     * @return this parser, for chaining
     * @throws java.util.regex.PatternSyntaxException if {@code filterRegex} is
     *         not a valid regular expression
     */
    public AnsjPaser addFilterRegex(String filterRegex) {
        // Compile once here rather than on every getText() call.
        filterPatterns.add(Pattern.compile(filterRegex));
        return this;
    }
}
package com.ansj.sun.impl;
import java.io.BufferedReader;
import java.io.IOException;
import com.ansj.sun.pojo.AnsjPaser;
import com.ansj.sun.util.IOUtil;
public class HtmlPaser{
public static void main(String[] args) throws IOException {
//阅读正文
BufferedReader br = IOUtil.getReader(
"C:\\Users\\caiqing\\Desktop\\ajax采集\\zhanghuaping.html",
"UTF-8");
StringBuilder sb = new StringBuilder();
String temp = null;
while ((temp = br.readLine()) != null) {
sb.append(temp);
}
System.out.println(sb);
// 模块抽取
String beginRegex = "<div class=\"MIB_feed_c\">\\W*?<p class=\"sms\"";
String endRegex = "<div id=\"_comment_list_miniblog.*?\"></div>\\W*?</div>";
AnsjPaser ansjHtml = new AnsjPaser(beginRegex, endRegex, sb.toString(),
AnsjPaser.TEXTTEGEX);
// 正文抽取
beginRegex = "<p class=\"sms\" mid=\"\\d*?\" type=\"\\d*?\">";
endRegex = "</p>";
AnsjPaser ansjContent = new AnsjPaser(beginRegex, endRegex).addFilterRegex("<.*?>");
// 时间抽取
beginRegex = "onclick=\"GB_SUD