// Date: 2014-05-17, view count: 20734
import java.io.IOException;
import jpcap.JpcapCaptor;
import jpcap.NetworkInterface;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
public class IDMBDownloader {
/**
 * Program entry point: hands the IMDb genre front-page URL to
 * {@link #URLprocess(String)}.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
    // Front page that processing starts from.
    final String genreFrontPage = "http://www.imdb.com/genre/";
    URLprocess(genreFrontPage);
}
/**
 * Creates a downloader instance. The constructor performs no work;
 * the methods visible in this class are static.
 */
public IDMBDownloader() { }
public static void URLprocess(String baseURL)
{
Document baseDocument = null;
try {
baseDocument = Jsoup.connect(baseURL).userAgent("Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0")
.ignoreContentType(true).timeout(30000).get();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("Entry success!");
Element genreTable = baseDocument.select("table[class^=genre-table]").get(0);
//get genre item
Elements gList = genreTable.select("h3");
String gTitle = null;
String gURL = null;
for(Element gType:gList){
Element gType1 = gList.select("a[href]").get(0);
/*
* for the text is like "Action "
* it has a Up-class letter and a ' '
*/
String s = gType1.text();
gTitle = s.substring(0, s.length()-2).toLowerCase();
gURL = gType1.attr("href");
System.out.println("Genre: " + gTitle);
//process
for(int i=1;i<1000;i+=50){
Document doc = null;
String sURL = "http://www.imdb.com/search/title?at=0&genres=" + gTitle + "&sort=moviemeter,asc&start=" + i + "&title_type=feature";
System.out.println("Search URL: " + sURL);
try {
doc = Jsoup.connect(sURL).userAgent("Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20100101 Firefox/22.0")
.ignoreContentType(true).timeout(30000).get();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
System.out.println("Genre page entry success!");
//Element movieDIV
Element movieTable = doc.select("table[class=results]").get(0);
Elements mList = movieTable.select("td[class=title]");
int j=i;
String mTitle = null;
String mURL = null;
for(Element mType:mList){
Element mType1 = mType.select("a[href]").get(0);
mTitle = mType1.text();
mURL = mType1.attr("href");
System.out.println("Tit