/*
* Zed Attack Proxy (ZAP) and its related class files.
*
* ZAP is an HTTP/HTTPS proxy for assessing web application security.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.zaproxy.zap.spider;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.htmlparser.jericho.Config;
import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.log4j.Logger;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpRequestHeader;
import org.zaproxy.zap.spider.filters.FetchFilter;
import org.zaproxy.zap.spider.filters.FetchFilter.FetchStatus;
import org.zaproxy.zap.spider.filters.ParseFilter;
import org.zaproxy.zap.spider.parser.SpiderGitParser;
import org.zaproxy.zap.spider.parser.SpiderHtmlFormParser;
import org.zaproxy.zap.spider.parser.SpiderHtmlParser;
import org.zaproxy.zap.spider.parser.SpiderODataAtomParser;
import org.zaproxy.zap.spider.parser.SpiderParser;
import org.zaproxy.zap.spider.parser.SpiderParserListener;
import org.zaproxy.zap.spider.parser.SpiderRedirectParser;
import org.zaproxy.zap.spider.parser.SpiderRobotstxtParser;
import org.zaproxy.zap.spider.parser.SpiderSVNEntriesParser;
import org.zaproxy.zap.spider.parser.SpiderSitemapXMLParser;
import org.zaproxy.zap.spider.parser.SpiderTextParser;
/**
* The SpiderController is used to manage the crawling process and interacts directly with the
* Spider Task threads.
*/
public class SpiderController implements SpiderParserListener {

    /** The fetch filters used by the spider to filter the resources which are fetched. */
    private LinkedList<FetchFilter> fetchFilters;

    /**
     * The parse filters used by the spider to filter the resources which were fetched, but should
     * not be parsed.
     */
    private LinkedList<ParseFilter> parseFilters;

    /** The parsers used by the spider, in the order in which they are queried. */
    private LinkedList<SpiderParser> parsers;

    /** Unmodifiable view over {@link #parsers}, handed out to callers of {@link #getParsers()}. */
    private List<SpiderParser> parsersUnmodifiableView;

    /** The spider. */
    private Spider spider;

    /** The canonical representations of the resources already visited using the GET method. */
    private Set<String> visitedGet;

    /**
     * The resources already visited using the POST method, mapping each URI to the list of request
     * bodies already submitted to it.
     */
    private Map<String, List<String>> visitedPost;

    /** The Constant log. */
    private static final Logger log = Logger.getLogger(SpiderController.class);

    /**
     * Instantiates a new spider controller.
     *
     * @param spider the spider
     * @param customParsers the custom spider parsers
     */
    protected SpiderController(Spider spider, List<SpiderParser> customParsers) {
        super();
        this.spider = spider;
        this.fetchFilters = new LinkedList<>();
        this.parseFilters = new LinkedList<>();
        this.visitedGet = new HashSet<>();
        this.visitedPost = new HashMap<>();
        prepareDefaultParsers();
        for (SpiderParser parser : customParsers) {
            this.addSpiderParser(parser);
        }
    }

    /**
     * Prepares the default set of parsers, honouring the options in the spider parameters, and
     * builds the unmodifiable view returned by {@link #getParsers()}.
     */
    private void prepareDefaultParsers() {
        this.parsers = new LinkedList<>();
        SpiderParser parser;

        // If parsing of robots.txt is enabled
        if (spider.getSpiderParam().isParseRobotsTxt()) {
            parser = new SpiderRobotstxtParser(spider.getSpiderParam());
            parsers.add(parser);
        }

        // If parsing of sitemap.xml is enabled
        if (spider.getSpiderParam().isParseSitemapXml()) {
            if (log.isDebugEnabled()) log.debug("Adding SpiderSitemapXMLParser");
            parser = new SpiderSitemapXMLParser(spider.getSpiderParam());
            parsers.add(parser);
        } else {
            if (log.isDebugEnabled()) log.debug("NOT Adding SpiderSitemapXMLParser");
        }

        // If parsing of SVN entries is enabled
        if (spider.getSpiderParam().isParseSVNEntries()) {
            parser = new SpiderSVNEntriesParser(spider.getSpiderParam());
            parsers.add(parser);
        }

        // If parsing of GIT entries is enabled
        if (spider.getSpiderParam().isParseGit()) {
            parser = new SpiderGitParser(spider.getSpiderParam());
            parsers.add(parser);
        }

        // Redirect requests parser
        parser = new SpiderRedirectParser();
        parsers.add(parser);

        // Simple HTML parser
        parser = new SpiderHtmlParser(spider.getSpiderParam());
        parsers.add(parser);

        // HTML Form parser
        parser = new SpiderHtmlFormParser(spider.getSpiderParam(), spider.getExtensionSpider().getValueGenerator());
        parsers.add(parser);
        // NOTE: this mutates Jericho's global compatibility mode, affecting all Jericho users in
        // the process, not only this controller.
        Config.CurrentCompatibilityMode.setFormFieldNameCaseInsensitive(false);

        // Prepare the parsers for OData ATOM files
        parser = new SpiderODataAtomParser();
        parsers.add(parser);

        // Prepare the parsers for simple non-HTML files
        parser = new SpiderTextParser();
        parsers.add(parser);

        this.parsersUnmodifiableView = Collections.unmodifiableList(parsers);
    }

    /**
     * Adds a new seed, if it wasn't already processed.
     *
     * @param uri the uri
     * @param method the http method used for fetching the resource
     */
    protected void addSeed(URI uri, String method) {
        // Check if the uri was processed already
        String visitedURI = buildCanonicalURI(uri);
        if (visitedURI == null) {
            return;
        }
        synchronized (visitedGet) {
            // Set.add returns false when the element was already present.
            if (!visitedGet.add(visitedURI)) {
                log.debug("URI already visited: " + visitedURI);
                return;
            }
        }
        // Create and submit the new task
        SpiderTask task = new SpiderTask(spider, null, uri, 0, method);
        spider.submitTask(task);
        // Add the uri to the found list
        spider.notifyListenersFoundURI(uri.toString(), method, FetchStatus.SEED);
    }

    /**
     * Builds the canonical representation of the given URI, used to detect already-visited
     * resources.
     *
     * @param uri the uri to canonicalize
     * @return the canonical representation, or {@code null} if it could not be built
     */
    private String buildCanonicalURI(URI uri) {
        try {
            return URLCanonicalizer.buildCleanedParametersURIRepresentation(uri, spider.getSpiderParam()
                    .getHandleParameters(), spider.getSpiderParam().isHandleODataParametersVisited());
        } catch (URIException e) {
            // The URI cannot be canonicalized, so it cannot be tracked or crawled.
            return null;
        }
    }

    /**
     * Gets the fetch filters used by the spider during the spidering process.
     *
     * @return the fetch filters
     */
    protected LinkedList<FetchFilter> getFetchFilters() {
        return fetchFilters;
    }

    /**
     * Adds a new fetch filter to the spider.
     *
     * @param filter the filter
     */
    public void addFetchFilter(FetchFilter filter) {
        log.debug("Loading fetch filter: " + filter.getClass().getSimpleName());
        fetchFilters.add(filter);
    }

    /**
     * Gets the parse filters used by the spider during the spidering process.
     *
     * @return the parse filters
     */
    protected LinkedList<ParseFilter> getParseFilters() {
        return parseFilters;
    }

    /**
     * Adds the parse filter to the spider controller.
     *
     * @param filter the filter
     */
    public void addParseFilter(ParseFilter filter) {
        log.debug("Loading parse filter: " + filter.getClass().getSimpleName());
        parseFilters.add(filter);
    }

    /**
     * Initializes a new crawling process: clears the visited-resources state and registers this
     * controller as listener on all the parsers.
     */
    public void init() {
        visitedGet.clear();
        visitedPost.clear();
        for (SpiderParser parser : parsers) {
            parser.addSpiderParserListener(this);
        }
    }

    /**
     * Clears the previous process.
     */
    public void reset() {
        visitedGet.clear();
        visitedPost.clear();
        for (SpiderParser parser : parsers) {
            parser.removeSpiderParserListener(this);
        }
    }

    @Override
    public void resourceURIFound(HttpMessage responseMessage, int depth, String uri, boolean shouldIgnore) {
        if (uri == null) {
            return;
        }
        log.debug("New resource found: " + uri);

        // Create the uri
        URI uriV = createURI(uri);
        if (uriV == null) {
            return;
        }

        // Check if the uri was processed already
        String visitedURI = buildCanonicalURI(uriV);
        if (visitedURI == null) {
            return;
        }
        synchronized (visitedGet) {
            if (!visitedGet.add(visitedURI)) {
                // Already visited, nothing more to do.
                return;
            }
        }

        // Check if any of the filters disallows this uri
        for (FetchFilter f : fetchFilters) {
            FetchStatus s = f.checkFilter(uriV);
            if (s != FetchStatus.VALID) {
                log.debug("URI: " + uriV + " was filtered by a filter with reason: " + s);
                spider.notifyListenersFoundURI(uri, HttpRequestHeader.GET, s);
                return;
            }
        }

        // Check if should be ignored and not fetched
        if (shouldIgnore) {
            log.debug("URI: " + uriV + " is valid, but will not be fetched, by parser reccommendation.");
            spider.notifyListenersFoundURI(uri, HttpRequestHeader.GET, FetchStatus.VALID);
            return;
        }

        spider.notifyListenersFoundURI(uri, HttpRequestHeader.GET, FetchStatus.VALID);

        // Submit the task
        SpiderTask task = new SpiderTask(spider, responseMessage.getRequestHeader().getURI(), uriV, depth, HttpRequestHeader.GET);
        spider.submitTask(task);
    }

    @Override
    public void resourceURIFound(HttpMessage responseMessage, int depth, String uri) {
        resourceURIFound(responseMessage, depth, uri, false);
    }

    @Override
    public void resourcePostURIFound(HttpMessage responseMessage, int depth, String uri, String requestBody) {
        log.debug("New POST resource found: " + uri);

        // Check if the uri was processed already (same uri AND same request body)
        synchronized (visitedPost) {
            if (arrayKeyValueExists(uri, requestBody)) {
                log.debug("URI already visited: " + uri);
                return;
            }
            List<String> bodies = visitedPost.get(uri);
            if (bodies == null) {
                bodies = new ArrayList<>();
                visitedPost.put(uri, bodies);
            }
            bodies.add(requestBody);
        }

        // Create the uri
        URI uriV = createURI(uri);
        if (uriV == null) {
            return;
        }

        // Check if any of the filters disallows this uri
        for (FetchFilter f : fetchFilters) {
            FetchStatus s = f.checkFilter(uriV);
            if (s != FetchStatus.VALID) {
                log.debug("URI: " + uriV + " was filtered by a filter with reason: " + s);
                spider.notifyListenersFoundURI(uri, HttpRequestHeader.POST, s);
                return;
            }
        }

        spider.notifyListenersFoundURI(uri, HttpRequestHeader.POST, FetchStatus.VALID);

        // Submit the task
        SpiderTask task = new SpiderTask(spider, responseMessage.getRequestHeader().getURI(), uriV, depth, HttpRequestHeader.POST, requestBody);
        spider.submitTask(task);
    }

    /**
     * Checks whether the value exists in the list associated to the given key.
     *
     * <p>Callers must hold the monitor of {@link #visitedPost} while invoking this method.
     *
     * @param key the string of the uri
     * @param value the request body of the uri
     * @return true or false depending whether the uri and request body have already been processed
     */
    private boolean arrayKeyValueExists(String key, String value) {
        List<String> bodies = visitedPost.get(key);
        return bodies != null && bodies.contains(value);
    }

    /**
     * Creates the {@link URI} starting from the uri string. First it tries to convert it into a
     * String considering it's already encoded and, if it fails, tries to create it considering it's
     * not encoded.
     *
     * @param uri the string of the uri
     * @return the URI, or null if an error occurred and the URI could not be constructed.
     */
    private URI createURI(String uri) {
        URI uriV = null;
        try {
            // Try to see if we can create the URI, considering it's encoded.
            uriV = new URI(uri, true);
        } catch (URIException e) {
            // An error occurred, so try to create the URI considering it's not encoded.
            try {
                log.debug("Second try...");
                uriV = new URI(uri, false);
            } catch (Exception ex) {
                log.error("Error while converting to uri: " + uri, ex);
                return null;
            }
            // A non URIException occurred, so just ignore the URI
        } catch (Exception e) {
            log.error("Error while converting to uri: " + uri, e);
            return null;
        }
        return uriV;
    }

    /**
     * Gets an unmodifiable view of the list of parsers that should be used during the scan.
     *
     * @return the parsers
     */
    public List<SpiderParser> getParsers() {
        return parsersUnmodifiableView;
    }

    /**
     * Adds a custom spider parser, which takes precedence over the previously added parsers.
     *
     * @param parser the parser
     */
    public void addSpiderParser(SpiderParser parser) {
        log.debug("Loading custom Spider Parser: " + parser.getClass().getSimpleName());
        this.parsers.addFirst(parser);
    }
}