日期:2014-05-17 浏览次数:20712 次
package com.ansj.sun.pojo; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class AnsjPaser { private String beginRegex; private String endRegex; private Matcher matcher; public final static String TEXTTEGEX = ".*?"; public final static String W = "\\W*?" ; public final static String N = "" ; private List<String> filterRegexList = new ArrayList<String>(); /** * * @param beginRegex 起始正则 * @param endRegex 结束正则 * @param content 需要解析的正文(如果没有此项必须为rest设置) * @param textRegex 其实和结束正则中间的部分,默认为.*? */ public AnsjPaser(String beginRegex, String endRegex, String content, String textRegex) { this.beginRegex = beginRegex; this.endRegex = endRegex; StringBuilder sb = new StringBuilder(); sb.append(beginRegex); sb.append(textRegex); sb.append(endRegex); matcher = Pattern.compile(sb.toString()).matcher(content); } /** * * @param beginRegex 起始正则 * @param endRegex 结束正则 * @param textRegex 其实和结束正则中间的部分,默认为.*? */ public AnsjPaser(String beginRegex, String endRegex, String textRegex) { this.beginRegex = beginRegex; this.endRegex = endRegex; StringBuilder sb = new StringBuilder(); sb.append(beginRegex); sb.append(textRegex); sb.append(endRegex); matcher = Pattern.compile(sb.toString()).matcher(N); } /** * @param beginRegex 起始正则 * @param endRegex 结束正则 */ public AnsjPaser(String beginRegex, String endRegex) { this.beginRegex = beginRegex; this.endRegex = endRegex; StringBuilder sb = new StringBuilder(); sb.append(beginRegex); sb.append(TEXTTEGEX); sb.append(endRegex); matcher = Pattern.compile(sb.toString()).matcher(N); } /** * @创建人:Ansj -创建时间:2011-8-16 下午09:30:56 * @方法描述: @return 返回正则内的内容去除了开始和结束标签,和需要过滤的正则返回用户需要的真正的内容 */ public String getText() { if (matcher.find()) { String str = matcher.group().trim().replaceFirst(beginRegex, N) .replaceAll(endRegex, N); Iterator<String> it = filterRegexList.iterator() ; while(it.hasNext()){ str = str.replaceAll(it.next(), N) ; } return str ; } return null; } /* * 得到下一个 */ public String getNext() { return matcher.group(); } /* * 是否包含下一个 */ public boolean hasNext() { return matcher.find(); } /** * @创建人:Ansj -创建时间:2011-8-17 上午12:11:12 * @方法描述: @param content 需要解析的正文 * @方法描述: @return 返回本身 * 这个方法是将此解析器重置,相当于重头开始.但是一些正则配置给予保留 */ public AnsjPaser reset(String content) { this.matcher.reset(content); return this ; } /* * 添加getText的正则过滤条件 */ public AnsjPaser addFilterRegex(String filterRegex){ filterRegexList.add(filterRegex) ; return this ; } }
package com.ansj.sun.impl; import java.io.BufferedReader; import java.io.IOException; import com.ansj.sun.pojo.AnsjPaser; import com.ansj.sun.util.IOUtil; public class HtmlPaser{ public static void main(String[] args) throws IOException { //阅读正文 BufferedReader br = IOUtil.getReader( "C:\\Users\\caiqing\\Desktop\\ajax采集\\zhanghuaping.html", "UTF-8"); StringBuilder sb = new StringBuilder(); String temp = null; while ((temp = br.readLine()) != null) { sb.append(temp); } System.out.println(sb); // 模块抽取 String beginRegex = "<div class=\"MIB_feed_c\">\\W*?<p class=\"sms\""; String endRegex = "<div id=\"_comment_list_miniblog.*?\"></div>\\W*?</div>"; AnsjPaser ansjHtml = new AnsjPaser(beginRegex, endRegex, sb.toString(), AnsjPaser.TEXTTEGEX); // 正文抽取 beginRegex = "<p class=\"sms\" mid=\"\\d*?\" type=\"\\d*?\">"; endRegex = "</p>"; AnsjPaser ansjContent = new AnsjPaser(beginRegex, endRegex).addFilterRegex("<.*?>"); // 时间抽取 beginRegex = "onclick=\"GB_SUD