日期:2014-05-17  浏览次数:20830 次

用 HtmlUnit 和 HttpClient 抓取施华洛世奇网站图片和动画(二)

try
????{
?????mainPage = webClient.getPage(url);
????} catch (Exception e)
????{
?????log.error(e.getMessage(), e);
????}
????if (mainPage != null)
????{
?????HtmlElement paginationContainer = mainPage
???????.getElementById("paginationContainer");
?????if (paginationContainer != null)
?????{
??????url = url + "/all-1";
??????try
??????{
???????mainPage = webClient.getPage(url);
??????} catch (Exception e)
??????{
???????log.error(e.getMessage(), e);
??????}
?????}
HtmlElement categories = mainPage.getElementById("categories");
?????if (categories == null)
?????{
??????HtmlElement products = mainPage.getElementById("products");
??????if(products!=null)
??????{
???????List<HtmlElement> productDivList = products.getElementsByTagName("div");
???????if(productDivList!=null&&productDivList.size()>0)
???????{
????????for(HtmlElement proDiv:productDivList)
????????{
?????????List<HtmlElement> subList = proDiv.getElementsByTagName("div");
?????????if(subList!=null)
?????????{
??????????for(HtmlElement dt:subList)
??????????{
???????????String classt = dt.getAttribute("class");
???????????if (classt != null && classt.equals("productImg"))
???????????{
????????????List<HtmlElement> subAList = proDiv.getElementsByTagName("a");
????????????if(subAList!=null&&subAList.size()>0)
????????????{
?????????????HtmlElement ae=subAList.get(0);
?????????????String href = ae.getAttribute("href");
?????????????thirdPageUrls.add(href);
????????????}
????????????break;
???????????}
??????????}
?????????}
????????}
???????}
??????}
?????}
?????else
?????{
????? List<HtmlElement> divList = categories.getElementsByTagName("div");
???????? if (divList != null && divList.size() > 0)
???????? {
???????for (HtmlElement div : divList)
???????{
????????List<HtmlElement> tempDivs = div
??????????.getElementsByTagName("div");
????????if (tempDivs != null && tempDivs.size() > 0)
????????{
?????????for (HtmlElement div1 : tempDivs)
?????????{
??????????List<HtmlElement> aList = div1
????????????.getElementsByTagName("a");
??????????HtmlAnchor a = (HtmlAnchor) aList
????????????.get(0);
??????????String link = a.getHrefAttribute();
??????????secondPageUrls.add(link);

?????????}

????????}
???????}
??????}
?????}
????

????}
???}
??}
??log.error("第二层抓取结束..........");
??log.error("目前抓取到的第二层URL个数为:"+secondPageUrls.size());

?

int count=0;
??
??for (String url : secondPageUrls)
??{
???count++;
???log.error("正在抓取第二层的第"+count+"个URL:"+url);
???HtmlPage mainPage = null;
???try
???{
????mainPage = webClient.getPage(url);
???} catch (Exception e)
???{
????log.error(e.getMessage(), e);
???}
???log.error("抓取URL完成:"+url+",正在分析URL"+url+"+结果的URL");
???if (mainPage != null)
???{
????HtmlElement paginationContainer = mainPage
??????.getElementById("paginationContainer");
????if (paginationContainer != null)
????{
?????url = url + "/all-1";
????}
????HtmlElement products = mainPage.getElementById("products");
????if (products != null)
????{
?????List<HtmlElement> list = products
???????.getElementsByTagName("div");
?????;
?????if (list == null || list.size() == 0)
?????{
??????continue;
?????}
?????for (HtmlElement h : list)
?????{
??????String cls = h.getAttribute("class");
??????if (cls == null || !cls.equals("productName"))
??????{
???????continue;
??????}
??????List<HtmlElement> links = h.getElementsByTagName("a");
??????if (links != null && links.size() &