日期:2014-05-16  浏览次数:20430 次

JSP中截取在线编辑器的字符串的处理
原文地址:http://www.cn-java.com/www1/?uid-572544-action-viewspace-itemid-54784

截取字符串的同时保留原有的标记并补全

package yidwo.com;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Truncates text produced by an online (rich-text) editor to a given display
 * width while keeping the HTML markup it contains balanced.
 *
 * <p>Markup is copied through untouched and does not count towards the width;
 * any element that is still open at the cut-off point gets a closing tag
 * appended so the fragment can be embedded in a page safely.
 */
public class SubStringHTML
{
	/**
	 * Elements that are void or whose end tag is customarily omitted. No closing
	 * tag is ever generated for them. Matched against the complete tag name, so
	 * {@code pre} is not confused with {@code p}, nor {@code header} with {@code head}.
	 */
	private static final Pattern NO_END_TAG = Pattern.compile(
			"area|base|basefont|body|br|col|colgroup|dd|dt|frame|head|hr|html|img|input"
			+ "|isindex|li|link|meta|option|p|param|tbody|td|tfoot|th|thead|tr",
			Pattern.CASE_INSENSITIVE);

	/**
	 * An opening or closing tag. Group 1 is "/" for a closing tag, group 2 the
	 * tag name (digits allowed, e.g. {@code h1}), group 3 the remainder of the
	 * tag (attributes and an optional trailing "/").
	 */
	private static final Pattern TAG =
			Pattern.compile("<(/?)([a-zA-Z][a-zA-Z0-9:_-]*)([^<>]*)>");

	/**
	 * Cuts {@code param} once its visible text reaches {@code length} width units.
	 *
	 * <p>Width is counted as follows: an ASCII character is 1, any other character
	 * is 2, a character entity such as {@code &nbsp;} is 1, and tags are 0. The
	 * count is independent of the platform charset. A two-unit character that
	 * straddles the limit is kept, so the result may exceed {@code length} by one.
	 *
	 * <p>{@code end} is always appended, even when nothing was cut off. Markup
	 * contained in {@code end} takes part in the tag balancing.
	 *
	 * @param param  the text (possibly containing HTML) to truncate; {@code null} is treated as empty
	 * @param length the maximum width of the visible text; values &lt;= 0 keep nothing
	 * @param end    the suffix to append after the kept text (e.g. "..."); {@code null} is treated as empty
	 * @return the truncated text, followed by {@code end} and the closing tags of
	 *         every element left open
	 */
	public String subStringHTML(String param, int length, String end) {
		String source = (param == null) ? "" : param;
		StringBuilder result = new StringBuilder();
		int width = 0;
		boolean inTag = false;    // between '<' and the matching '>'
		boolean inEntity = false; // between '&' and the terminating ';'

		// The width only grows outside tags/entities, so the loop can never stop
		// in the middle of one.
		for (int i = 0; i < source.length() && width < length; i++) {
			char c = source.charAt(i);
			if (inTag) {
				// Everything inside a tag (including '&' in URLs) is plain markup.
				if (c == '>') {
					inTag = false;
				}
			} else if (inEntity) {
				if (c == ';') {
					inEntity = false;
					width++; // the whole entity renders as a single character
				}
			} else if (c == '<' && startsTag(source, i)) {
				inTag = true;
			} else if (c == '&' && startsEntity(source, i)) {
				inEntity = true;
			} else {
				// Ordinary text, including a stray '<' or '&'.
				width += (c > 0x7F) ? 2 : 1;
			}
			result.append(c);
		}

		if (end != null) {
			result.append(end);
		}

		// Close the still-open elements, innermost first.
		List<String> unclosed = findUnclosedTags(result.toString());
		for (int i = unclosed.size() - 1; i >= 0; i--) {
			result.append("</").append(unclosed.get(i)).append('>');
		}
		return result.toString();
	}

	/** Returns the names of the elements opened but not closed in {@code html}, outermost first. */
	private static List<String> findUnclosedTags(String html) {
		List<String> open = new ArrayList<String>();
		Matcher m = TAG.matcher(html);
		while (m.find()) {
			String name = m.group(2);
			if (NO_END_TAG.matcher(name).matches()) {
				continue;
			}
			boolean closing = m.group(1).length() > 0;
			if (closing) {
				// Remove the innermost open element of that name; a stray closing
				// tag without a matching opener is ignored.
				for (int i = open.size() - 1; i >= 0; i--) {
					if (open.get(i).equalsIgnoreCase(name)) {
						open.remove(i);
						break;
					}
				}
			} else if (!m.group(3).endsWith("/")) {
				// Self-closed tags such as <foo/> need no end tag.
				open.add(name);
			}
		}
		return open;
	}

	/** Tells whether the '<' at {@code index} begins markup rather than being a literal less-than sign. */
	private static boolean startsTag(String s, int index) {
		if (index + 1 >= s.length()) {
			return false;
		}
		char next = s.charAt(index + 1);
		return next == '/' || next == '!' || next == '?' || isAsciiLetter(next);
	}

	/** Tells whether the '&' at {@code index} begins a terminated entity such as {@code &nbsp;} or {@code &#160;}. */
	private static boolean startsEntity(String s, int index) {
		int j = index + 1;
		if (j < s.length() && s.charAt(j) == '#') {
			j++;
		}
		int bodyStart = j;
		while (j < s.length() && (isAsciiLetter(s.charAt(j)) || isAsciiDigit(s.charAt(j)))) {
			j++;
		}
		return j > bodyStart && j < s.length() && s.charAt(j) == ';';
	}

	private static boolean isAsciiLetter(char c) {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
	}

	private static boolean isAsciiDigit(char c) {
		return c >= '0' && c <= '9';
	}

}



截取纯文本字符串

public String subStringHTML(String param,int length,String end){ 
       if(param.length()<length){
        return param;
       }else{
     StringBuffer result = new StringBuffer(); 
        int n = 0; 
        char temp; 
        boolean isCode = false;
        boolean isHTML = false; 
        String strResult = "";
        for(int i=0;i<param.length();i++){ 
            temp = param.charAt(i); 
            if(temp=='<'){ 
                isCode = true; 
            }else if(temp == '&'){ 
                isHTML   =   true; 
            }else if (temp == '>' && isCode){ 
                n = n - 1; 
                isCode = false; 
            }else if(temp == ';' && isHTML){ 
                isHTML = false; 
            } 
 
            if(!isCode && temp!='>'){ 
                n = n+1; 
                if((temp+" ").getBytes().length>1){ 
                    n = n+1; 
                } 
                result.append(temp); 
            } 
            if(n>=length){ 
             break; 
            } 
        } 
        if(isHTML){
            String str[]=result.toString().split("&");
            for(int j=0;j<str.length-1;j++){
             strResult += (j==0?"":"&")+str[j];
            }
           }else{
            strResult = result.toString();
           }
        strResult += end;
        return strResult;