日期:2014-05-17 浏览次数:20747 次
<url> <loc>http://www.google.com.hk</loc> <lastmod>2010-09-29</lastmod> <changefreq>weekly</changefreq> <priority>0.7</priority> </url>
/**
 * Contract for a URL that carries a "may be crawled" flag.
 *
 * <p>In this file the contract is implemented by {@code ExtWebSiteMapUrl},
 * where {@link #canCrawl()} starts out {@code true} and becomes
 * {@code false} once {@link #disable()} has been called.
 */
public interface CrawlUrl {

    /**
     * Reports whether this URL is currently eligible for crawling.
     *
     * @return {@code true} if the URL may be crawled, {@code false} otherwise
     */
    boolean canCrawl();

    /**
     * Marks this URL as no longer eligible for crawling.
     */
    void disable();
}
import org.apache.commons.lang.StringUtils;

import com.redfin.sitemapgenerator.WebSitemapUrl;

/**
 * A {@link WebSitemapUrl} that additionally carries a crawl flag
 * (see {@link CrawlUrl}).
 *
 * <p>Two instances are considered equal when the external string form of
 * their URLs is equal; the crawl flag does not take part in equality.
 * {@link #hashCode()} is derived from the same string so that instances can
 * be used safely in hash-based collections (e.g. a "visited" set).
 */
public class ExtWebSiteMapUrl extends WebSitemapUrl implements CrawlUrl {

    /** Crawl flag; {@code true} until {@link #disable()} is called. */
    private boolean canCrawl = true;

    /**
     * Creates a sitemap URL from the given options.
     *
     * @param options sitemap URL options, passed unchanged to the superclass
     */
    public ExtWebSiteMapUrl(Options options) {
        super(options);
    }

    /**
     * @return {@code true} while this URL has not been disabled
     */
    @Override
    public boolean canCrawl() {
        return canCrawl;
    }

    /**
     * Clears the crawl flag so that {@link #canCrawl()} returns {@code false}.
     */
    @Override
    public void disable() {
        canCrawl = false;
    }

    /**
     * Compares by URL string.
     *
     * @param obj the object to compare with; may be {@code null}
     * @return {@code true} if {@code obj} is an {@code ExtWebSiteMapUrl} whose
     *         {@link #getUrlStr()} equals this instance's
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // instanceof is false for null, so no separate null check is needed.
        if (!(obj instanceof ExtWebSiteMapUrl)) {
            return false;
        }
        ExtWebSiteMapUrl other = (ExtWebSiteMapUrl) obj;
        return StringUtils.equals(other.getUrlStr(), getUrlStr());
    }

    /**
     * Hash code consistent with {@link #equals(Object)}: based solely on the
     * URL string, so equal URLs always land in the same hash bucket.
     *
     * @return hash of {@link #getUrlStr()}, or 0 if that string is {@code null}
     */
    @Override
    public int hashCode() {
        String urlStr = getUrlStr();
        return urlStr == null ? 0 : urlStr.hashCode();
    }

    /**
     * @return the external string form of the URL held by the superclass
     */
    public String getUrlStr() {
        return super.getUrl().toExternalForm();
    }
}
import java.io.File; import java.net.MalformedURLException; import java.util.Date; import java.util.LinkedList; import java.util.Queue; import java.util.TimeZone; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang.StringUtils; import org.htmlparser.Parser; import org.htmlparser.filters.NodeClassFilter; import org.htmlparser.tags.LinkTag; import org.htmlparser.util.NodeIterator; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import com.redfin.sitemapgenerator.ChangeFreq; import com.redfin.sitemapgenerator.W3CDateFormat; import com.redfin.sitemapgenerator.WebSitemapGenerator; import com.redfin.sitemapgenerator.WebSitemapUrl; public class HtmlCrawler { private static NodeClassFilter LINK_FILTER = new NodeClassFilter( LinkTag.class); private static Parser parser = new Parser(); private static File dir = new File("D:\\sitemaptest"); private static String BASE_PREFIX = "http://www.xxxx.com"; private static WebSitemapGenerator wsg = null; static { W3CDateFormat dateFormat = new W3CDateFormat(W3CDateFormat.Pattern.DAY); dateFormat.setTimeZone(TimeZone.getTimeZone("GMT+8")); try { wsg = WebSitemapGenerator.builder(BASE_PREFIX, dir).dateFormat( dateFormat).build(); } catch (MalformedURLException e) { System.out.println("the start url [" + BASE_PREFIX + "] is malformed"); } } public static void main(String[] args) throws ParserException, MalformedURLException { ExtWebSiteMapUrl startUrl = new ExtWebSiteMapUrl( new WebSitemapUrl.Options("http://www.xxxx.com").lastMod( new Date()).priority(0.9).changeFreq(ChangeFreq.WEEKLY)); Queue<ExtWebSiteMapUrl> queue = new LinkedList<ExtWebSiteMapUrl>(); queue.add(startUrl); crawl(queue, wsg); System.out.println("done")