日期:2014-05-17  浏览次数:20510 次

Curl 采集乱码与采集不到 PHP
PHP程序是用gbk2312编码的:

<?php
$url = "http://www.sina.com.cn";//gbk2312编码
//$url = "http://www.163.com";//gbk2312编码
//$url = "http://www.sohu.com";//gbk2312编码
 

  $ch = curl_init($url);
  curl_setopt($ch, CURLOPT_RETURNTRANSFER , true);//返回获取的输出的文本流
  $ret = curl_exec($ch);
  curl_setopt($ch, CURLOPT_TIMEOUT, 1);
  curl_close($ch);
  echo $ret;

?>

在采集sina.com.cn时,是正常的,但是采集163.com时是为空的,采集sohu.com时是丢码的.
这是怎么回事呢?如何解决?有哪位怎么呀?先谢谢了!!!没多少分了,不好意思。

------解决方案--------------------
别的不说,我就是来拿分的.楼主记得给全分

PHP code


$curl=curl_init('http://www.163.com');
curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
curl_setopt($curl,CURLOPT_USERAGENT,'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)');
$html=curl_exec($curl);
var_dump($html);


$curl=curl_init('http://www.sohu.com');
curl_setopt($curl,CURLOPT_RETURNTRANSFER,1);
curl_setopt($curl,CURLOPT_USERAGENT,'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322)');
$html=curl_exec($curl);
//$html=strstr($html,'<');
$html=gzdecode($html);
var_dump($html);


function gzdecode($data) {   
  $len = strlen($data);   
  if ($len < 18 || strcmp(substr($data,0,2),"\x1f\x8b")) {   
   return null;  // Not GZIP format (See RFC 1952)   
  }   
  $method = ord(substr($data,2,1));  // Compression method   
  $flags  = ord(substr($data,3,1));  // Flags   
  if ($flags & 31 != $flags) {   
   // Reserved bits are set -- NOT ALLOWED by RFC 1952   
   return null;   
  }   
  // NOTE: $mtime may be negative (PHP integer limitations)   
  $mtime = unpack("V", substr($data,4,4));   
  $mtime = $mtime[1];   
  $xfl  = substr($data,8,1);   
  $os    = substr($data,8,1);   
  $headerlen = 10;   
  $extralen  = 0;   
  $extra    = "";   
  if ($flags & 4) {   
   // 2-byte length prefixed EXTRA data in header   
   if ($len - $headerlen - 2 < 8) {   
     return false;    // Invalid format   
   }   
   $extralen = unpack("v",substr($data,8,2));   
   $extralen = $extralen[1];   
   if ($len - $headerlen - 2 - $extralen < 8) {   
     return false;    // Invalid format   
   }   
   $extra = substr($data,10,$extralen);   
   $headerlen += 2 + $extralen;   
  }   
  
  $filenamelen = 0;   
  $filename = "";   
  if ($flags & 8) {   
   // C-style string file NAME data in header   
   if ($len - $headerlen - 1 < 8) {   
     return false;    // Invalid format   
   }   
   $filenamelen = strpos(substr($data,8+$extralen),chr(0));   
   if ($filenamelen === false || $len - $headerlen - $filenamelen - 1 < 8) {   
     return false;    // Invalid format   
   }   
   $filename = substr($data,$headerlen,$filenamelen);   
   $headerlen += $filenamelen + 1;   
  }   
  
  $commentlen = 0;   
  $comment = "";   
  if ($flags & 16) {   
   // C-style string COMMENT data in header   
   if ($len - $headerlen - 1 < 8) {   
     return false;    // Invalid format   
   }   
   $commentlen = strpos(substr($data,8+$extralen+$filenamelen),chr(0));   
   if ($commentlen === false || $len - $headerlen - $commentlen - 1 < 8) {   
     return false;    // Invalid header format   
   }   
   $comment = substr($data,$headerlen,$commentlen);   
   $headerlen += $commentlen + 1;   
  }   
  
  $headercrc = "";   
  if ($flags & 1) {   
   // 2-bytes (lowest order) of CRC32 on header present   
   if ($len - $headerlen - 2 < 8) {   
     return false;    // Invalid format   
   }   
   $calccrc = crc32(substr($data,0,$headerlen)) & 0xffff;   
   $headercrc = unpack("v", substr($data,$headerlen,2));   
   $headercrc = $headercrc[1];   
   if ($headercrc != $calccrc) {   
     return false;    // Bad header CRC