通过 Socket 下载页面时,部分页面出现乱码,请大家帮忙解决
我在采集某一站点的页面,该站点的页面使用 GB2312 编码,但采集到的部分页面中会出现少量乱码。这个问题已经困扰我很久了。以下是采集部分的代码,请大家帮忙看看问题出在哪里。
#region public static string GetClientBySocket(string UrlString) //通过Socket取得页面
/// <summary>
/// 通过Socket取得页面
/// </summary>
/// <param name="UrlString"></param>
/// <returns> </returns>
public static string GetClientBySocket(string UrlString)
{
string HostName = URLHelper.GetHostName(UrlString);
IPAddress[] ips = Dns.GetHostAddresses(HostName);
IPAddress ip = ips[0];
IPEndPoint serverhost = new IPEndPoint(ip, 80);
Socket clientSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
clientSocket.Connect(serverhost);
string httpReq = "GET " + UrlString + " HTTP/1.0 \r\n ";
httpReq += "Host: " + HostName + " \r\n ";
httpReq += "User-Agent: Mozilla/4.0 (compatible; MSIE 5.00; Windows 98) \r\n ";
httpReq += "Accept:*/* \r\n ";
httpReq += "Connection:Keep-Alive \r\n\r\n ";
string txtHTML= " ";
try
{
clientSocket.Send(System.Text.Encoding.ASCII.GetBytes(httpReq));
Byte[] buffer = new byte[10240];