日期:2014-05-17 浏览次数:20915 次
<url> <loc>http://www.google.com.hk</loc> <lastmod>2010-09-29</lastmod> <changefreq>weekly</changefreq> <priority>0.7</priority> </url>
/**
 * Contract for an item that carries a "may be crawled" flag.
 *
 * <p>The interface itself only declares the two operations; what enabling or
 * disabling means in practice is up to the implementing class.
 */
public interface CrawlUrl {

    /**
     * Marks this item as no longer eligible for crawling.
     */
    void disable();

    /**
     * Reports the current state of the crawl flag.
     *
     * @return {@code true} if this item may be crawled, {@code false} otherwise
     */
    boolean canCrawl();
}
import org.apache.commons.lang.StringUtils;
import com.redfin.sitemapgenerator.WebSitemapUrl;
/**
 * A {@link WebSitemapUrl} that additionally carries a crawl flag.
 *
 * <p>A new instance starts out crawlable; {@link #disable()} clears the flag.
 * Two instances are considered equal when the external string forms of their
 * URLs are equal. The crawl flag does not take part in equality or hashing.
 */
public class ExtWebSiteMapUrl extends WebSitemapUrl implements CrawlUrl {

    /** Crawl flag; {@code true} until {@link #disable()} is called. */
    private boolean canCrawl = true;

    /**
     * Creates the URL entry from the given sitemap options.
     *
     * @param options sitemap URL options, passed straight to the superclass
     */
    public ExtWebSiteMapUrl(Options options) {
        super(options);
    }

    /**
     * @return {@code true} while {@link #disable()} has not been called
     */
    @Override
    public boolean canCrawl() {
        return canCrawl;
    }

    /** Clears the crawl flag; there is no method to set it again. */
    @Override
    public void disable() {
        canCrawl = false;
    }

    /**
     * Compares by URL string only.
     *
     * @param obj the object to compare with; may be {@code null}
     * @return {@code true} if {@code obj} is an {@code ExtWebSiteMapUrl} whose
     *         {@link #getUrlStr()} equals this one's
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // instanceof is false for null, so no separate null check is needed.
        if (!(obj instanceof ExtWebSiteMapUrl)) {
            return false;
        }
        ExtWebSiteMapUrl other = (ExtWebSiteMapUrl) obj;
        return StringUtils.equals(other.getUrlStr(), getUrlStr());
    }

    /**
     * Hash code derived from the same URL string that {@link #equals(Object)}
     * compares, so equal instances always hash alike and behave correctly in
     * hash-based collections.
     *
     * @return the hash of {@link #getUrlStr()}, or {@code 0} if it is {@code null}
     */
    @Override
    public int hashCode() {
        String urlStr = getUrlStr();
        return urlStr == null ? 0 : urlStr.hashCode();
    }

    /**
     * @return the external string form of the URL held by the superclass
     */
    public String getUrlStr() {
        return super.getUrl().toExternalForm();
    }
}
import java.io.File;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.LinkedList;
import java.util.Queue;
import java.util.TimeZone;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.redfin.sitemapgenerator.ChangeFreq;
import com.redfin.sitemapgenerator.W3CDateFormat;
import com.redfin.sitemapgenerator.WebSitemapGenerator;
import com.redfin.sitemapgenerator.WebSitemapUrl;
public class HtmlCrawler {
// Node filter constructed for the LinkTag node class; its use is outside this
// view of the file.
private static NodeClassFilter LINK_FILTER = new NodeClassFilter(
LinkTag.class);
// Single Parser instance held statically; where it is used is not visible here.
private static Parser parser = new Parser();
// Directory handed to WebSitemapGenerator.builder(...) in the static initializer.
// NOTE(review): hard-coded Windows path — confirm it exists on the target machine.
private static File dir = new File("D:\\sitemaptest");
// Base URL handed to WebSitemapGenerator.builder(...); main() uses the same
// literal as the start URL.
private static String BASE_PREFIX = "http://www.xxxx.com";
// Sitemap generator, assigned in the static initializer. Stays null if the
// builder call there throws MalformedURLException.
private static WebSitemapGenerator wsg = null;
static {
W3CDateFormat dateFormat = new W3CDateFormat(W3CDateFormat.Pattern.DAY);
dateFormat.setTimeZone(TimeZone.getTimeZone("GMT+8"));
try {
wsg = WebSitemapGenerator.builder(BASE_PREFIX, dir).dateFormat(
dateFormat).build();
} catch (MalformedURLException e) {
System.out.println("the start url [" + BASE_PREFIX
+ "] is malformed");
}
}
public static void main(String[] args) throws ParserException,
MalformedURLException {
ExtWebSiteMapUrl startUrl = new ExtWebSiteMapUrl(
new WebSitemapUrl.Options("http://www.xxxx.com").lastMod(
new Date()).priority(0.9).changeFreq(ChangeFreq.WEEKLY));
Queue<ExtWebSiteMapUrl> queue = new LinkedList<ExtWebSiteMapUrl>();
queue.add(startUrl);
crawl(queue, wsg);
System.out.println("done")