package com.crawljax.util;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableList.Builder;
/**
* Utility class that contains methods used by Crawljax and some plugin to deal with XPath
* resolving, constructing etc.
*/
public final class XPathHelper {
private static final Pattern TAG_PATTERN = Pattern
.compile("(?<=[/|::])[a-zA-z]+(?=([/|\\[]|$))");
private static final Pattern ID_PATTERN = Pattern.compile("(@[a-zA-Z]+)");
private static final String FULL_XPATH_CACHE = "FULL_XPATH_CACHE";
private static final int MAX_SEARCH_LOOPS = 10000;
/**
* Reverse Engineers an XPath Expression of a given Node in the DOM.
*
* @param node
* the given node.
* @return string xpath expression (e.g., "/html[1]/body[1]/div[3]").
*/
public static String getXPathExpression(Node node) {
Object xpathCache = node.getUserData(FULL_XPATH_CACHE);
if (xpathCache != null) {
return xpathCache.toString();
}
Node parent = node.getParentNode();
if ((parent == null) || parent.getNodeName().contains("#document")) {
String xPath = "/" + node.getNodeName() + "[1]";
node.setUserData(FULL_XPATH_CACHE, xPath, null);
return xPath;
}
StringBuffer buffer = new StringBuffer();
if (parent != node) {
buffer.append(getXPathExpression(parent));
buffer.append("/");
}
buffer.append(node.getNodeName());
List<Node> mySiblings = getSiblings(parent, node);
for (int i = 0; i < mySiblings.size(); i++) {
Node el = mySiblings.get(i);
if (el.equals(node)) {
buffer.append('[').append(Integer.toString(i + 1)).append(']');
// Found so break;
break;
}
}
String xPath = buffer.toString();
node.setUserData(FULL_XPATH_CACHE, xPath, null);
return xPath;
}
/**
* Get siblings of the same type as element from parent.
*
* @param parent
* parent node.
* @param element
* element.
* @return List of sibling (from element) under parent
*/
public static List<Node> getSiblings(Node parent, Node element) {
List<Node> result = new ArrayList<Node>();
NodeList list = parent.getChildNodes();
for (int i = 0; i < list.getLength(); i++) {
Node el = list.item(i);
if (el.getNodeName().equals(element.getNodeName())) {
result.add(el);
}
}
return result;
}
/**
* Returns the list of nodes which match the expression xpathExpr in the String domStr.
*
* @return the list of nodes which match the query
* @throws XPathExpressionException
* @throws IOException
*/
public static NodeList evaluateXpathExpression(String domStr, String xpathExpr)
throws XPathExpressionException, IOException {
Document dom = DomUtils.asDocument(domStr);
return evaluateXpathExpression(dom, xpathExpr);
}
/**
* Returns the list of nodes which match the expression xpathExpr in the Document dom.
*
* @param dom
* the Document to search in
* @param xpathExpr
* the xpath query
* @return the list of nodes which match the query
* @throws XPathExpressionException
* On error.
*/
public static NodeList evaluateXpathExpression(Document dom, String xpathExpr)
throws XPathExpressionException {
XPathFactory factory = XPathFactory.newInstance();
XPath xpath = factory.newXPath();
XPathExpression expr = xpath.compile(xpathExpr);
Object result = expr.evaluate(dom, XPathConstants.NODESET);
NodeList nodes = (NodeList) result;
return nodes;
}
/**
* Returns the XPaths of all nodes retrieved by xpathExpression. Example: //DIV[@id='foo']
* returns /HTM[1]/BODY[1]/DIV[2]
*
* @param dom
* The dom.
* @param xpathExpression
* The expression to find the element.
* @return list of XPaths retrieved by xpathExpression.
* @throws XPathExpressionException
*/
public static ImmutableList<String> getXpathForXPathExpressions(Document dom,
String xpathExpression) throws XPathExpressionException {
NodeList nodeList = XPathHelper.evaluateXpathExpression(dom, xpathExpression);
Builder<String> result = ImmutableList.builder();
if (nodeList.getLength() > 0) {
for (int i = 0; i < nodeList.getLength(); i++) {
Node n = nodeList.item(i);
result.add(getXPathExpression(n));
}
}
return result.build();
}
/**
* @param xpath
* The xpath to format.
* @return formatted xpath with tag names in uppercase and attributes in lowercase
*/
public static String formatXPath(String xpath) {
String formatted = capitalizeTagNames(xpath);
formatted = lowerCaseAttributes(formatted);
return formatted;
}
private static String lowerCaseAttributes(String formatted) {
Matcher m = ID_PATTERN.matcher(formatted);
StringBuffer sb = new StringBuffer();
while (m.find()) {
String text = m.group();
m.appendReplacement(sb, Matcher.quoteReplacement(text.toLowerCase()));
}
m.appendTail(sb);
return sb.toString();
}
private static String capitalizeTagNames(String xpath) {
Matcher m = TAG_PATTERN.matcher(xpath);
StringBuffer sb = new StringBuffer();
while (m.find()) {
String text = m.group();
m.appendReplacement(sb, Matcher.quoteReplacement(text.toUpperCase()));
}
m.appendTail(sb);
return sb.toString();
}
/**
* @param xpath
* The xpath expression to find the last element of.
* @return returns the last element in the xpath expression
*/
public static String getLastElementXPath(String xpath) {
String[] elements = xpath.split("/");
for (int i = elements.length - 1; i >= 0; i--) {
if (!elements[i].equals("") && elements[i].indexOf("()") == -1
&& !elements[i].startsWith("@")) {
return stripEndSquareBrackets(elements[i]);
}
}
return "";
}
/**
* @param string
* @return string without the before [
*/
private static String stripEndSquareBrackets(String string) {
if (string.contains("[")) {
return string.substring(0, string.indexOf('['));
} else {
return string;
}
}
/**
* returns position of xpath element which match the expression xpath in the String dom.
*
* @param dom
* the Document to search in
* @param xpath
* the xpath query
* @return position of xpath element, if fails returns -1
**/
public static int getXPathLocation(String dom, String xpath) {
String dom_lower = dom.toLowerCase();
String xpath_lower = xpath.toLowerCase();
String[] elements = xpath_lower.split("/");
int pos = 0;
int temp;
int number;
for (String element : elements) {
if (!element.isEmpty() && !element.startsWith("@") && !element.contains("()")) {
if (element.contains("[")) {
try {
number =
Integer.parseInt(element.substring(element.indexOf("[") + 1,
element.indexOf("]")));
} catch (NumberFormatException e) {
return -1;
}
} else {
number = 1;
}
for (int i = 0; i < number; i++) {
// find new open element
temp = dom_lower.indexOf("<" + stripEndSquareBrackets(element), pos);
if (temp > -1) {
pos = temp + 1;
// if depth>1 then goto end of current element
if (number > 1 && i < number - 1) {
pos =
getCloseElementLocation(dom_lower, pos,
stripEndSquareBrackets(element));
}
}
}
}
}
return pos - 1;
}
/**
* @param dom
* The dom string.
* @param pos
* Position where to start searching.
* @param element
* The element.
* @return the position where the close element is
*/
public static int getCloseElementLocation(String dom, int pos, String element) {
String[] elements = { "LINK", "META", "INPUT", "BR" };
List<String> singleElements = Arrays.asList(elements);
if (singleElements.contains(element.toUpperCase())) {
return dom.indexOf('>', pos) + 1;
}
// make sure not before the node
int openElements = 1;
int i = 0;
int position = pos;
String dom_lower = dom.toLowerCase();
String element_lower = element.toLowerCase();
String openElement = "<" + element_lower;
String closeElement = "</" + element_lower;
while (i < MAX_SEARCH_LOOPS) {
if (dom_lower.indexOf(openElement, position) == -1
&& dom_lower.indexOf(closeElement, position) == -1) {
return -1;
}
if (dom_lower.indexOf(openElement, position) < dom_lower.indexOf(closeElement,
position)
&& dom_lower.indexOf(openElement, position) != -1) {
openElements++;
position = dom_lower.indexOf(openElement, position) + 1;
} else {
openElements--;
position = dom_lower.indexOf(closeElement, position) + 1;
}
if (openElements == 0) {
break;
}
i++;
}
return position - 1;
}
/**
* @param dom
* The dom.
* @param xpath
* The xpath expression.
* @return the position where the close element is
*/
public static int getCloseElementLocation(String dom, String xpath) {
return getCloseElementLocation(dom, getXPathLocation(dom, xpath) + 1,
getLastElementXPath(xpath));
}
/**
* @param xpath
* The xpath expression.
* @return the xpath expression for only the element location. Leaves out the attributes and
* text()
*/
public static String stripXPathToElement(String xpath) {
String xpathStripped = xpath;
if (!Strings.isNullOrEmpty(xpathStripped)) {
if (xpathStripped.toLowerCase().contains("/text()")) {
xpathStripped =
xpathStripped
.substring(0, xpathStripped.toLowerCase().indexOf("/text()"));
}
if (xpathStripped.toLowerCase().contains("/comment()")) {
xpathStripped =
xpathStripped.substring(0,
xpathStripped.toLowerCase().indexOf("/comment()"));
}
if (xpathStripped.contains("@")) {
xpathStripped = xpathStripped.substring(0, xpathStripped.indexOf("@") - 1);
}
}
return xpathStripped;
}
private XPathHelper() {
}
}