/*
* Zed Attack Proxy (ZAP) and its related class files.
*
* ZAP is an HTTP/HTTPS proxy for assessing web application security.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.zaproxy.zap.spider.parser;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import net.htmlparser.jericho.Source;
import org.parosproxy.paros.network.HttpMessage;
import org.parosproxy.paros.network.HttpStatusCode;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.zaproxy.zap.spider.SpiderParam;
import org.zaproxy.zap.utils.XmlUtils;
/**
* SitemapXMLParser is used for parsing URLs from a sitemap.xml file, which sometimes (very helpfully) resides in the web root.
* @author 70pointer
*
*/
public class SpiderSitemapXMLParser extends SpiderParser {
/** a pattern to match the sitemap.xml file name*/
private Pattern SITEMAP_XML_FILENAME_PATTERN = Pattern.compile("/sitemap\\.xml$");
/** a pattern to match the sitemap.xml file.. hint: It's XML*/
private static final Pattern xmlPattern = Pattern.compile ("^<\\?xml\\s+version\\s*=\\s*\"[0-9.]+\"\\s+encoding\\s*=\\s*\"[^\"]+\"\\s*\\?>");
/** The Spider parameters. */
private SpiderParam params;
/** used to parse the XML based file format */
private static DocumentBuilder dBuilder;
/**
* an x path expression to match the "loc" tag in sitemap.xml
*/
private static XPathExpression xpathLocationExpression;
/** statically initialise the XML DocumentBuilderFactory and DocumentBuilder */
static {
try {
dBuilder = XmlUtils.newXxeDisabledDocumentBuilderFactory().newDocumentBuilder();
XPath xpath = XPathFactory.newInstance().newXPath();
xpathLocationExpression = xpath.compile("/urlset/url/loc/text()");
} catch (ParserConfigurationException | XPathExpressionException e) {
log.error(e);
}
}
/**
* Instantiates a new sitemap.xml parser.
*
* @param params the params
* @throws IllegalArgumentException if {@code params} is null.
*/
public SpiderSitemapXMLParser(SpiderParam params) {
super();
if (params == null) {
throw new IllegalArgumentException("Parameter params must not be null.");
}
this.params = params;
}
@Override
public boolean parseResource(HttpMessage message, Source source, int depth) {
if (log.isDebugEnabled()) log.debug("Parsing a sitemap.xml resource...");
if (message == null || !params.isParseSitemapXml() ||
!message.getResponseHeader().isXml() ||
HttpStatusCode.isClientError(message.getResponseHeader().getStatusCode()) ||
HttpStatusCode.isServerError(message.getResponseHeader().getStatusCode())) {
return false;
}
// Get the response content
byte [] response = message.getResponseBody().getBytes();
String baseURL = message.getRequestHeader().getURI().toString();
Matcher xmlFormatMatcher = xmlPattern.matcher(new String (response));
if (xmlFormatMatcher.find()) {
if (log.isDebugEnabled()) log.debug("The format matches XML");
try {
Document xmldoc = dBuilder.parse(new InputSource(new ByteArrayInputStream(response)));
NodeList locationNodes = (NodeList) xpathLocationExpression.evaluate(xmldoc, XPathConstants.NODESET);
for (int i = 0; i < locationNodes.getLength(); i++) {
processURL(message, depth, locationNodes.item(i).getNodeValue(), baseURL);
}
}
catch (Exception e) {
log.error("An error occurred trying to parse sitemap.xml", e);
return false;
}
// We consider the message fully parsed, so it doesn't get parsed by 'fallback' parsers
return true;
} else {
//the file name is right, but the content is not. Pass it to another parser.
if (log.isDebugEnabled()) log.debug("The content of the response from '"+ baseURL + "' does not match the expected content for a sitemap.xml file. Ignoring it.");
return false;
}
}
@Override
public boolean canParseResource(HttpMessage message, String path, boolean wasAlreadyParsed) {
if (log.isDebugEnabled()) log.debug("canParseResource called on '"+ path + "'");
// matches the file name of files that should be parsed with the sitemap.xml file parser
Matcher matcher = SITEMAP_XML_FILENAME_PATTERN.matcher(path);
return matcher.find();
}
}