日期:2014-05-20  浏览次数:20669 次

如何查找UTF-8字符串中的字符串?(字符编码问题)
我用 Apache HttpClient 4.1.1 抓取网页,用 String 的 indexOf 方法搜索其中是否含有感兴趣的关键字。搜索 GBK、GB2312 编码的网页时正常,遇到 UTF-8 编码的网页就无法搜索,抓取下来的中文内容打印出来也无法辨认。肯定是编码问题了,不知该怎么解决。搜索了好长时间,试了各种转换编码的方法,但都不能把抓取下来的中文内容正常打印出来,搜索结果也都是 -1。

------解决方案--------------------
看楼主也很纠结的;当学习,下载了apache httpclient 4.1.3,给个示例(使用探测工具探测编码失败,就不写了):
Java code

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.charset.Charset;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Small demo: POSTs to a test servlet, decodes the response body with the
 * character set declared in the {@code Content-Type} header, and prints
 * every line that contains the keyword "编码".
 *
 * <p>The character set is taken only from the {@code charset} parameter of
 * {@code Content-Type}. The {@code Content-Encoding} header is deliberately
 * not consulted: it names a content coding such as {@code gzip}, not a
 * character set.
 */
public class EncodedPostTest {
    /** Charset used when the response declares none, or one this JVM cannot decode. */
    private static final String DEFAULT_CHARSET = "gbk";

    /** Name of the Content-Type parameter that carries the character set. */
    private static final String CHARSET_PARAM = "charset";

    /**
     * Runs the demo request and prints the matching lines to standard output.
     *
     * @param args ignored
     * @throws Exception if the request fails or the body cannot be read
     */
    public static void main(String[] args) throws Exception {
        HttpClient httpclient = new DefaultHttpClient();
        BufferedReader bufReader = null;
        try {
            HttpPost httppost = new HttpPost(
                    "http://localhost:8080/TestJEEProject/EncodingServlet");
            HttpResponse response = httpclient.execute(httppost);
            if (response.getStatusLine().getStatusCode() != HttpStatus.SC_OK) {
                System.out.println("Unexpected failure: "
                        + response.getStatusLine().toString());
                return;
            }

            // getEntity() may return null when the response carries no body.
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                System.out.println("Response has no entity body");
                return;
            }

            Header contentType = entity.getContentType();
            String charset = extractCharset(
                    contentType == null ? null : contentType.getValue());
            if (!isSupportedCharset(charset)) {
                // Missing, malformed or unknown charset: fall back instead of
                // failing inside the InputStreamReader constructor.
                charset = DEFAULT_CHARSET;
            }

            System.out.println("Charset : " + charset);

            bufReader = new BufferedReader(
                    new InputStreamReader(entity.getContent(), charset));
            // NOTE: the keyword literal is only correct if this source file is
            // compiled with the encoding it was saved in (javac -encoding).
            String line;
            while ((line = bufReader.readLine()) != null) {
                if (line.indexOf("编码") != -1) {
                    System.out.println(line);
                }
            }
        } finally {
            // Close the reader before tearing down the connections, and make
            // sure the shutdown still happens if close() throws.
            try {
                if (bufReader != null) {
                    bufReader.close();
                }
            } finally {
                httpclient.getConnectionManager().shutdown();
            }
        }
    }

    /**
     * Extracts the {@code charset} parameter from a Content-Type header value
     * such as {@code text/html; charset="UTF-8"; boundary=x}.
     *
     * @param contentType raw header value, may be {@code null}
     * @return the charset name without quotes or surrounding whitespace, or
     *         {@code null} if the value has no non-empty charset parameter
     */
    static String extractCharset(String contentType) {
        if (contentType == null) {
            return null;
        }
        String[] parts = contentType.split(";");
        for (int i = 0; i < parts.length; i++) {
            String part = parts[i];
            int eq = part.indexOf('=');
            if (eq < 0) {
                continue; // the media type itself, or a parameter without a value
            }
            String name = part.substring(0, eq).trim();
            if (!CHARSET_PARAM.equalsIgnoreCase(name)) {
                continue;
            }
            String value = part.substring(eq + 1).trim();
            if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                value = value.substring(1, value.length() - 1).trim();
            }
            return value.length() == 0 ? null : value;
        }
        return null;
    }

    /**
     * Tells whether the given name denotes a charset this JVM can decode.
     *
     * @param name candidate charset name, may be {@code null}
     * @return {@code true} only if the name is legal and supported
     */
    static boolean isSupportedCharset(String name) {
        if (name == null) {
            return false;
        }
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException: the server sent a syntactically invalid name.
            return false;
        }
    }
}