日期:2014-05-20  浏览次数:20754 次

写了一个搜索引擎的下载模块，请大家看看能不能用，也希望大家多提意见。
Java code
 
public class getpage { // Downloads the web pages listed in config.txt

    /** Read timeout, so a server that never closes the connection cannot stall the run. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Downloads every address listed in {@code config.txt} (one per line, in the
     * form {@code host/path}, without a scheme) and stores each page in
     * {@code page0.txt}, {@code page1.txt}, ...
     *
     * <p>Each page file starts with one line holding the base URL (the address up
     * to and including its last {@code /}), followed by the markup of the page.
     * An address that cannot be downloaded is reported on standard output and
     * skipped; it does not consume a page number, so the numbering stays
     * contiguous.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        int sum = 0; // index of the next page file to write
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream("config.txt")))) {
            String add;
            while ((add = reader.readLine()) != null) {
                add = add.trim();
                if (add.isEmpty()) {
                    continue;
                }
                try {
                    String markup = extractMarkup(fetch(add));
                    if (markup == null) {
                        System.out.println(add + ": no markup found in response");
                        continue;
                    }
                    try (PrintStream file = new PrintStream("page" + sum + ".txt")) {
                        file.println(baseOf(add));
                        file.println(markup);
                    }
                    sum++;
                } catch (java.io.IOException e) {
                    // One unreachable or malformed address must not abort the whole run.
                    System.out.println(add + ": " + e);
                }
            }
        } catch (java.io.IOException e) {
            System.out.println(e);
        }
    }

    /**
     * Sends a GET request for the given address to port 80 and returns the raw
     * response (status line, headers and body) decoded with the platform charset.
     */
    private static String fetch(String add) throws java.io.IOException {
        int slash = add.indexOf('/');
        String host = slash == -1 ? add : add.substring(0, slash);
        String path = slash == -1 ? "/" : add.substring(slash);
        if (host.isEmpty()) {
            throw new java.io.IOException("address has no host part");
        }
        try (Socket s = new Socket(host, 80)) {
            s.setSoTimeout(TIMEOUT_MILLIS);
            PrintStream outp = new PrintStream(s.getOutputStream());
            // HTTP requires CRLF line endings; println would use the platform separator.
            outp.print("GET " + path + " HTTP/1.1\r\n");
            outp.print("Host: " + host + "\r\n");
            // Ask the server to close the connection so that end-of-stream marks
            // the end of the response.
            outp.print("Connection: close\r\n");
            outp.print("\r\n");
            outp.flush();

            // A single read() may return only part of the response; read until EOF.
            java.io.InputStream in = s.getInputStream();
            java.io.ByteArrayOutputStream response = new java.io.ByteArrayOutputStream();
            byte[] b = new byte[8192];
            int n;
            while ((n = in.read(b)) != -1) {
                response.write(b, 0, n);
            }
            return response.toString();
        }
    }

    /**
     * Returns the part of the response body from its first {@code <} to its last
     * {@code >}, or {@code null} when the body contains no such span.
     *
     * <p>NOTE(review): a chunked response body is not decoded, so chunk-size
     * lines inside the markup are kept as they arrive.
     */
    private static String extractMarkup(String response) {
        // Skip the header section so a '<' inside a header is not taken for markup.
        int headerEnd = response.indexOf("\r\n\r\n");
        int bodyStart = headerEnd == -1 ? 0 : headerEnd + 4;
        int first = response.indexOf('<', bodyStart);
        int last = response.lastIndexOf('>');
        if (first == -1 || last < first) {
            return null;
        }
        return response.substring(first, last + 1);
    }

    /** Returns the address up to and including its last {@code /}. */
    private static String baseOf(String add) {
        int slash = add.lastIndexOf('/');
        return slash == -1 ? add + "/" : add.substring(0, slash + 1);
    }
}
public class gettext { // Extracts the text between the tags of a downloaded page

    /**
     * Reads a page file, removes line breaks, spaces and {@code &nbsp;} entities,
     * and writes the text found between each {@code >} and the following
     * {@code <} to the output file, concatenated without separators.
     *
     * <p>Anything before the first {@code >} and after the last {@code <} is
     * dropped. The content of every element is kept, whatever the tag.
     *
     * @param args optional: {@code args[0]} is the input file (default
     *             {@code page.txt}), {@code args[1]} is the output file (default
     *             {@code url.txt}); the defaults are the names this program has
     *             always used
     */
    public static void main(String[] args) {
        String inputPath = args != null && args.length > 0 ? args[0] : "page.txt";
        String outputPath = args != null && args.length > 1 ? args[1] : "url.txt";
        try {
            String s = readAll(inputPath);
            s = s.replace("\r", "").replace("\n", "");
            s = s.replace(" ", "");
            s = s.replace("&nbsp;", "");
            try (PrintStream out = new PrintStream(outputPath)) {
                int close = s.indexOf('>');
                while (close != -1) {
                    // All spaces were removed above, so the tag start must be
                    // searched as "<" alone; " <" could never match.
                    int open = s.indexOf('<', close + 1);
                    if (open == -1) {
                        break;
                    }
                    out.print(s.substring(close + 1, open));
                    // Resume after the tag start so that a '>' inside the text
                    // just printed is not treated as the end of another tag.
                    close = s.indexOf('>', open + 1);
                }
            }
        } catch (java.io.IOException e) {
            System.out.println(e);
        }
    }

    /** Returns the whole content of the file, decoded with the platform charset. */
    private static String readAll(String path) throws java.io.IOException {
        try (FileInputStream input = new FileInputStream(path)) {
            java.io.ByteArrayOutputStream content = new java.io.ByteArrayOutputStream();
            byte[] b = new byte[8192];
            int n;
            // A single read() is not guaranteed to return the whole file.
            while ((n = input.read(b)) != -1) {
                content.write(b, 0, n);
            }
            return content.toString();
        }
    }
}
public class geturl { // Extracts relative .htm/.html links from the downloaded pages

    /**
     * Scans {@code page0.txt}, {@code page1.txt}, ... until the first missing
     * file and writes the links found in them to {@code url.txt}, one per line.
     *
     * <p>The first line of each page file is taken as the base URL. Every
     * double-quoted value containing {@code .htm} on the remaining lines is
     * treated as a link. Links containing {@code www} or {@code http} are
     * skipped; every other link is written with the base URL in front of it.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try (PrintStream out = new PrintStream("url.txt")) {
            // Stop at the first missing page file instead of relying on a
            // FileNotFoundException to leave the loop.
            for (int i = 0; new java.io.File("page" + i + ".txt").isFile(); i++) {
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(new FileInputStream("page" + i + ".txt")))) {
                    String a = reader.readLine();
                    if (a == null) {
                        continue; // empty page file: no base URL, nothing to scan
                    }
                    String url;
                    while ((url = reader.readLine()) != null) {
                        writeLinks(a, url, out);
                    }
                }
            }
        } catch (java.io.IOException e) {
            System.out.println(e);
        }
    }

    /** Writes base + link for every relative, double-quoted .htm link on the line. */
    private static void writeLinks(String base, String line, PrintStream out) {
        int from = 0;
        while (true) {
            int ext = line.indexOf(".htm", from);
            if (ext == -1) {
                return;
            }
            int open = line.lastIndexOf('"', ext);
            int close = line.indexOf('"', ext);
            // open < from means the nearest quote belongs to text already
            // handled, so this ".htm" is not inside a quoted value of its own.
            if (open < from || close == -1) {
                from = ext + 4;
                continue;
            }
            String link = line.substring(open + 1, close);
            if (link.indexOf("www") == -1 && link.indexOf("http") == -1) {
                out.println(base + link);
            }
            from = close + 1;
        }
    }
}


------解决方案--------------------
jiefen~~~
------解决方案--------------------
....
------解决方案--------------------