日期:2014-05-17 浏览次数:20743 次
import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; public class Test { public static void main(String[] args) { try { String[] urlAry = new String[]{ //百度网页,"埇"字正常 "http://www.baidu.com/s?cl=3&wd=%CB%DE%88%AC%D6%B4", //百度知道,"埇"字乱码 "http://zhidao.baidu.com/q?word=%CB%DE%88%AC%D6%B4&lm=0&fr=search&ct=17&pn=0&tn=ikaslist&rn=10" }; for (String queryURL : urlAry) { DefaultHttpClient client = new DefaultHttpClient(); HttpGet httpget = new HttpGet(queryURL); HttpResponse response = client.execute(httpget); HttpEntity entity = response.getEntity(); String returnText = EntityUtils.toString(entity,"gb2312"); //网页代码 // System.out.println(returnText); //通过正则表达式,摘出要比较的部分 getTextByRule(".*?(宿.*?执).*",returnText); client.getConnectionManager().shutdown(); } } catch (Exception e) { e.printStackTrace(); } } public static void getTextByRule(String parttern, String str){ Pattern p = Pattern.compile(parttern); Matcher matcher = p.matcher(str); if(matcher.find()) { System.out.println(matcher.group(1)); } } }
String returnText = EntityUtils.toString(entity,"GBK");
------解决方案--------------------
有没有发现实际上这句中的编码根本没有起作用,defaultCharset只有在entity中未提供编码时才会起作用
String returnText = EntityUtils.toString(entity,"gb2312");
编码随便改,即使改成123也不会对结果有任何影响
public static String toString(HttpEntity entity,
String defaultCharset)
throws IOException,
ParseException
Get the entity content as a String, using the provided default character set if none is found in the entity. If defaultCharset is null, the default "ISO-8859-1" is used.
两个结果的差异是由各自URL返回的响应中声明的编码(Content-Type 的 charset)决定的,前者是GBK,后者是GB2312
因埇字在GB2312中无编码,在GBK中是88AC(十进制:34988),所以后者无法呈现。
------解决方案--------------------