package ecologylab.bigsemantics.documentparsers; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import ecologylab.bigsemantics.actions.SemanticsConstants; import ecologylab.collections.Scope; /** * Amends XPaths for 1) fixing potential problems with extraction, such as converting absolute paths * to relative paths, and 2) enhancing XPath functionalities. * * This class should be stateless and thread safe, for reusing between threads. * * @author quyin */ public class XPathAmender implements SemanticsConstants { public static final String LOOP_VAR = "$i"; static Logger logger = LoggerFactory.getLogger(XPathAmender.class); public String amend(String xpath, Scope<Object> params) { String result = xpath; if (result != null) { result = absoluteToRelative(result); result = assignLoopVariables(result, params); result = joinLines(result); } if (result != xpath && !result.equals(xpath)) { logger.debug("Amended xpath \"{}\" to \"{}\"", xpath, result); } return result; } /** * To prevent infinite loop when a type refers to itself, e.g. * <google_patent>.<references> * * @param xpath * @return */ protected String absoluteToRelative(String xpath) { // in the beginning if (xpath.startsWith("/")) { xpath = "." + xpath; } // can also be like "(//xpath1) or (//xpath2)". if (xpath.contains("(/")) { xpath = xpath.replace("(/", "(./"); } // TODO with more and more cases, eventually we may need to fully parse xpaths to amend it. return xpath; } protected String assignLoopVariables(String xpath, Scope<Object> params) { if (xpath.contains(LOOP_VAR)) { int elementIndex = (Integer) params.get(ELEMENT_INDEX_IN_COLLECTION); xpath = xpath.replace(LOOP_VAR, String.valueOf(elementIndex + 1)); } return xpath; } protected String joinLines(String xpath) { if (xpath.contains("\n") || xpath.contains("\r")) { xpath = xpath.replace("\n", "").replace("\r", ""); } return xpath; } }