// Date: 2014-05-16  View count: 20620
/* Copyright (C) 2003 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Created on Nov 17, 2003
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
package org.archive.crawler.extractor;
import java.io.IOException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.io.ReplayCharSequence;
import org.archive.net.UURI;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;
/**
* Processes Javascript files for strings that are likely to be
* crawlable URIs.
*
* @contributor gojomo
* @contributor szznax
*
*/
public class ExtractorJS extends Extractor implements CoreAttributeConstants {
/** Serialization version identifier for this class. */
private static final long serialVersionUID = -2231962381454717720L;

/**
 * Class-wide logger, registered under this class's fully qualified name.
 * Declared {@code final} so the shared reference cannot be reassigned.
 */
private static final Logger LOGGER =
    Logger.getLogger("org.archive.crawler.extractor.ExtractorJS");
// Regular expression that finds whitespace-free string literals in
// JavaScript: the text between a pair of matching ' or " characters.
// The opening quote may be preceded by up to 8 backslashes (matched
// possessively); the closing delimiter must repeat the opening one exactly
// (backreference \1), backslashes included. The string body is matched
// reluctantly and is capped at UURI.MAX_URL_LENGTH non-whitespace characters.
static final String JAVASCRIPT_STRING_EXTRACTOR =
"(\\\\{0,8}+(?:\"|\'))(\\S{0,"+UURI.MAX_URL_LENGTH+"}?)(?:\\1)";
// GROUPS:
// (G1) ' or " with optional leading backslashes
// (G2) whitespace-free string delimited on both ends by G1
// Per-instance counter; by its name, the number of CrawlURIs this extractor
// has handled. NOTE(review): its updates are not visible in this part of the
// file -- confirm where it is incremented.
protected long numberOfCURIsHandled = 0;
// Class-wide (static) counter, shared by every ExtractorJS instance; by its
// name, the number of links extracted. NOTE(review): unlike the counter
// above, this one is static -- confirm that is intentional and that
// concurrent updates are safe.
protected static long numberOfLinksExtracted = 0;
/**
 * URIs for which the current JS extractor is known to yield false positives.
 * For example, as of 2.0.3 the JS extractor produces 13 false-positive URIs
 * from http://www.google-analytics.com/urchin.js and only 2 good URIs,
 * which are merely one-pixel images.
 * TODO: remove this blacklist when the JS extractor is improved
 */
protected static final String[] EXTRACTOR_URI_EXCEPTIONS = {
    "http://www.google-analytics.com/urchin.js"
};
/**
 * Creates a JavaScript link extractor.
 *
 * @param name handed unchanged to the superclass constructor, together with
 *     a fixed human-readable description of this extractor
 */
public ExtractorJS(String name) {
    super(
        name,
        "JavaScript extractor. Link extraction on JavaScript"
            + " files (.js).");
}
/* (non-Javadoc)
* @see org.archive.crawler.framework.Processor#process(org.archive.crawler.datamodel.CrawlURI)
*/
public void extract(CrawlURI curi) {
// special-cases, for when we know our current JS extractor does poorly.
// TODO: remove this test when JS extractor is improved
for (String s: EXTRACTOR_URI_EXCEPTIONS) {
if (curi.toString().equals(s))
return;
}
if (!isHttpTransactionContentToProcess(curi)) {
return;
}
String contentType = curi.getContentType();
if ((contentType == null)) {
return;
}
// If content type is not js and if the viaContext
// does not begin with 'script',