/* * ModeShape (http://www.modeshape.org) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.modeshape.jcr.sequencer; import java.io.Serializable; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.modeshape.common.annotation.Immutable; import org.modeshape.common.util.CheckArg; import org.modeshape.common.util.HashCode; import org.modeshape.common.util.ObjectUtil; import org.modeshape.jcr.GraphI18n; /** * An expression that defines an acceptable path using a regular-expression-like language. Path expressions can be used to * represent node paths or properties. * <p> * Let's first look at some simple examples of path expressions: * </p> * <table> * <tr> * <th>Path expression</th> * <th>Description</th> * </tr> * <tr> * <td>/a/b</td> * <td>Match node "<code>b</code>" that is a child of the top level node "<code>a</code>". Neither node may have any * same-name-sibilings.</td> * </tr> * <tr> * <td>/a/*</td> * <td>Match any child node of the top level node "<code>a</code>".</td> * </tr> * <tr> * <td>/a/*.txt</td> * <td>Match any child node of the top level node "<code>a</code>" that also has a name ending in "<code>.txt</code>".</td> * </tr> * <tr> * <td>/a/b@c</td> * <td>Match the property "<code>c</code>" of node "<code>/a/b</code>".</td> * </tr> * <tr> * <td>/a/b[2]</td> * <td>The second child named "<code>b</code>" below the top level node "<code>a</code>".</td> * </tr> * <tr> * <td>/a/b[2,3,4]</td> * <td>The second, third or fourth child named "<code>b</code>" below the top level node "<code>a</code>".</td> * </tr> * <tr> * <td>/a/b[*]</td> * <td>Any (and every) child named "<code>b</code>" below the top level node "<code>a</code>".</td> * </tr> * <tr> * <td>//a/b</td> * <td>Any node named "<code>b</code>" that exists below a node named "<code>a</code>", regardless of where node "<code>a</code>" * occurs. Again, neither node may have any same-name-sibilings.</td> * </tr> * </table> * <p> * With these simple examples, you can probably discern the most important rules. First, the '<code>*</code>' is a wildcard * character that matches any character or sequence of characters in a node's name (or index if appearing in between square * brackets), and can be used in conjunction with other characters (e.g., "<code>*.txt</code>"). * </p> * <p> * Second, square brackets (i.e., '<code>[</code>' and '<code>]</code>') are used to match a node's same-name-sibiling index. You * can put a single non-negative number or a comma-separated list of non-negative numbers. Use '0' to match a node that has no * same-name-sibilings, or any positive number to match the specific same-name-sibling. * </p> * <p> * Third, combining two delimiters (e.g., "<code>//</code>") matches any sequence of nodes, regardless of what their names are or * how many nodes. Often used with other patterns to identify nodes at any level matching other patterns. Three or more sequential * slash characters are treated as two. * </p> * <p> * Many path expressions can be created using just these simple rules. However, input paths can be more complicated. Here are some * more examples: * </p> * <table> * <tr> * <th>Path expressions</th> * <th>Description</th> * </tr> * <tr> * <td>/a/(b|c|d)</td> * <td>Match children of the top level node "<code>a</code>" that are named "<code>a</code>", "<code>b</code>" or "<code>c</code> * ". None of the nodes may have same-name-sibling indexes.</td> * </tr> * <tr> * <td>/a/b[c/d]</td> * <td>Match node "<code>b</code>" child of the top level node "<code>a</code>", when node "<code>b</code>" has a child named " * <code>c</code>", and "<code>c</code>" has a child named "<code>d</code>". Node "<code>b</code> * " is the selected node, while nodes "<code>b</code>" and "<code>b</code>" are used as criteria but are not selected.</td> * </tr> * <tr> * <td>/a(/(b|c|d|)/e)[f/g/@something]</td> * <td>Match node "<code>/a/b/e</code>", "<code>/a/c/e</code>", "<code>/a/d/e</code>", or "<code>/a/e</code> * " when they also have a child "<code>f</code>" that itself has a child "<code>g</code>" with property "<code>something</code>". * None of the nodes may have same-name-sibling indexes.</td> * </tr> * </table> * <p> * These examples show a few more advanced rules. Parentheses (i.e., '<code>(</code>' and '<code>)</code>') can be used to define * a set of options for names, as shown in the first and third rules. Whatever part of the selected node's path appears between * the parentheses is captured for use within the output path. Thus, the first input path in the previous table would match node " * <code>/a/b</code>", and "b" would be captured and could be used within the output path using "<code>$1</code>", where the * number used in the output path identifies the parentheses. * </p> * <p> * Square brackets can also be used to specify criteria on a node's properties or children. Whatever appears in between the square * brackets does not appear in the selected node. * </p> * <h3>Workspace names</h3> * <p> * Path expressions can also specify restrictions on the workspace name to constrain the path expression to matching only paths * from certain workspaces meeting the name criteria. Of course, if the path expression doesn't include these restrictions, the * workspace name are not considered when matching paths. * </p> */ @Immutable public class PathExpression implements Serializable { /** * Initial version */ private static final long serialVersionUID = 1L; /** * Compile the supplied expression and return the resulting path expression instance. * * @param expression the expression * @return the path expression; never null * @throws IllegalArgumentException if the expression is null * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression */ public static final PathExpression compile( String expression ) throws InvalidPathExpressionException { return new PathExpression(expression); } private static final String SEQUENCE_PATTERN_STRING = "\\[(\\d+(?:,\\d+)*)\\]"; // \[(\d+(,\d+)*)\] private static final Pattern SEQUENCE_PATTERN = Pattern.compile(SEQUENCE_PATTERN_STRING); /** * Regular expression used to find unusable XPath predicates within an expression. This pattern results in unusable predicates * in group 1. Note that some predicates may be valid at the end but not valid elsewhere. * <p> * Currently, only index-like predicates (including sequences) are allowed everywhere. Predicates with paths and properties * are allowed only as the last predicate. Predicates with any operators are unused. * </p> * <p> * Nested predicates are not currently allowed. * </p> */ // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(?:\[[^\]\+\-\*=\!><'"\s]+\])$|(\[[^\]]+\]) private static final String UNUSABLE_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(?:\\[[^\\]\\+\\-\\*=\\!><'\"\\s]+\\])$|(\\[[^\\]]+\\])"; private static final Pattern UNUSABLE_PREDICATE_PATTERN = Pattern.compile(UNUSABLE_PREDICATE_PATTERN_STRING); /** * Regular expression used to find all XPath predicates except index and sequence patterns. This pattern results in the * predicates to be removed in group 1. */ // \[(?:(?:\d+(?:,\d+)*)|\*)\]|(\[[^\]]+\]) private static final String NON_INDEX_PREDICATE_PATTERN_STRING = "\\[(?:(?:\\d+(?:,\\d+)*)|\\*)\\]|(\\[[^\\]]+\\])"; private static final Pattern NON_INDEX_PREDICATE_PATTERN = Pattern.compile(NON_INDEX_PREDICATE_PATTERN_STRING); /** * The regular expression that is used to extract the workspace name and path from an path expression (or a real path). The * regular expression is <code>(([^:/]*):)?(.*)</code>. Group 2 will contain the workspace name and group 3 the path. */ private static final String WORKSPACE_AND_PATH_PATTERN_STRING = "(([^:/]*):)?(.*)"; private static final Pattern WORKSPACE_AND_PATH_PATTERN = Pattern.compile(WORKSPACE_AND_PATH_PATTERN_STRING); private final String expression; /** * This is the pattern that is used to determine if the particular path is from a particular workspace. This pattern will be * null if the expression does not constrain the workspace name. */ private final Pattern workspacePattern; /** * This is the pattern that is used to determine if there is a match with particular paths. */ private final Pattern matchPattern; /** * This is the pattern that is used to determine which parts of the particular input paths are included in the * {@link Matcher#getSelectedNodePath() selected path}, only after the input path has already matched. */ private final Pattern selectPattern; /** * Create the supplied expression. * * @param expression the expression * @throws IllegalArgumentException if the expression is null * @throws InvalidPathExpressionException if the expression is blank or is not a valid expression */ public PathExpression( String expression ) throws InvalidPathExpressionException { CheckArg.isNotNull(expression, "path expression"); this.expression = expression.trim(); if (this.expression.length() == 0) { throw new InvalidPathExpressionException(GraphI18n.pathExpressionMayNotBeBlank.text()); } // Separate out the repository name, workspace name, and path fragments into separate match patterns ... WorkspacePath repoPath = parsePathInWorkspace(this.expression); if (repoPath == null) { throw new InvalidPathExpressionException(GraphI18n.pathExpressionHasInvalidMatch.text(this.expression, this.expression)); } String workPatternStr = repoPath.workspaceName != null ? repoPath.workspaceName : ".*"; String pathPatternStr = repoPath.path; this.workspacePattern = Pattern.compile(workPatternStr); // Build the repository match pattern ... // Build the match pattern, which determines whether a path matches the condition ... String matchString = pathPatternStr; try { matchString = removeUnusedPredicates(matchString); matchString = replaceXPathPatterns(matchString); this.matchPattern = Pattern.compile(matchString, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { String msg = GraphI18n.pathExpressionHasInvalidMatch.text(matchString, this.expression); throw new InvalidPathExpressionException(msg, e); } // Build the select pattern, which determines the path that will be selected ... String selectString = pathPatternStr; try { selectString = removeAllPredicatesExceptIndexes(selectString); selectString = replaceXPathPatterns(selectString); selectString = "(" + selectString + ").*"; // group 1 will have selected path ... this.selectPattern = Pattern.compile(selectString, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { String msg = GraphI18n.pathExpressionHasInvalidSelect.text(selectString, this.expression); throw new InvalidPathExpressionException(msg, e); } } /** * @return expression */ public String getExpression() { return expression; } /** * Replace certain XPath patterns that are not used or understood. * * @param expression the input regular expressions string; may not be null * @return the regular expression with all unused XPath patterns removed; never null */ protected String removeUnusedPredicates( String expression ) { assert expression != null; java.util.regex.Matcher matcher = UNUSABLE_PREDICATE_PATTERN.matcher(expression); // CHECKSTYLE IGNORE check FOR NEXT 1 LINES StringBuffer sb = new StringBuffer(); if (matcher.find()) { do { // Remove those predicates that show up in group 1 ... String predicateStr = matcher.group(0); String unusablePredicateStr = matcher.group(1); if (unusablePredicateStr != null) { predicateStr = ""; } matcher.appendReplacement(sb, predicateStr); } while (matcher.find()); matcher.appendTail(sb); expression = sb.toString(); } return expression; } /** * Remove all XPath predicates from the supplied regular expression string. * * @param expression the input regular expressions string; may not be null * @return the regular expression with all XPath predicates removed; never null */ protected String removeAllPredicatesExceptIndexes( String expression ) { assert expression != null; java.util.regex.Matcher matcher = NON_INDEX_PREDICATE_PATTERN.matcher(expression); // CHECKSTYLE IGNORE check FOR NEXT 1 LINES StringBuffer sb = new StringBuffer(); if (matcher.find()) { do { // Remove those predicates that show up in group 1 ... String predicateStr = matcher.group(0); String unusablePredicateStr = matcher.group(1); if (unusablePredicateStr != null) { predicateStr = ""; } matcher.appendReplacement(sb, predicateStr); } while (matcher.find()); matcher.appendTail(sb); expression = sb.toString(); } return expression; } /** * Replace certain XPath patterns, including some predicates, with substrings that are compatible with regular expressions. * * @param expression the input regular expressions string; may not be null * @return the regular expression with XPath patterns replaced with regular expression fragments; never null */ protected String replaceXPathPatterns( String expression ) { assert expression != null; // replace 2 or more sequential '|' characters in an OR expression expression = expression.replaceAll("[\\|]{2,}", "|"); // if there is an empty expression in an OR expression, make the whole segment optional ... // (e.g., "/a/b/(c|)/d" => "a/b(/(c))?/d" expression = expression.replaceAll("/(\\([^|]+)(\\|){2,}([^)]+\\))", "(/$1$2$3)?"); expression = expression.replaceAll("/\\(\\|+([^)]+)\\)", "(?:/($1))?"); expression = expression.replaceAll("/\\((([^|]+)(\\|[^|]+)*)\\|+\\)", "(?:/($1))?"); // // Allow any path (that doesn't contain an explicit counter) to contain a counter, // // done by replacing any '/' or '|' that isn't preceded by ']' or '*' or '/' or '(' with '(\[\d+\])?/'... // input = input.replaceAll("(?<=[^\\]\\*/(])([/|])", "(?:\\\\[\\\\d+\\\\])?$1"); // Does the path contain any '[]' or '[*]' or '[0]' or '[n]' (where n is any positive integers)... // '[*]/' => '(\[\d+\])?/' expression = expression.replaceAll("\\[\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional // '[]/' => '(\[\d+\])?/' expression = expression.replaceAll("\\[[*]\\]", "(?:\\\\[\\\\d+\\\\])?"); // index is optional // '[0]/' => '(\[0\])?/' expression = expression.replaceAll("\\[0\\]", "(?:\\\\[0\\\\])?"); // index is optional // '[n]/' => '\[n\]/' expression = expression.replaceAll("\\[([1-9]\\d*)\\]", "\\\\[$1\\\\]"); // index is required // Change any other end predicates to not be wrapped by braces but to begin with a slash ... // ...'[x]' => ...'/x' expression = expression.replaceAll("(?<!\\\\)\\[([^\\]]*)\\]$", "/$1"); // Replace all '[n,m,o,p]' type sequences with '[(n|m|o|p)]' java.util.regex.Matcher matcher = SEQUENCE_PATTERN.matcher(expression); // CHECKSTYLE IGNORE check FOR NEXT 1 LINES StringBuffer sb = new StringBuffer(); boolean result = matcher.find(); if (result) { do { String sequenceStr = matcher.group(1); boolean optional = false; if (sequenceStr.startsWith("0,")) { sequenceStr = sequenceStr.replaceFirst("^0,", ""); optional = true; } if (sequenceStr.endsWith(",0")) { sequenceStr = sequenceStr.replaceFirst(",0$", ""); optional = true; } if (sequenceStr.contains(",0,")) { sequenceStr = sequenceStr.replaceAll(",0,", ","); optional = true; } sequenceStr = sequenceStr.replaceAll(",", "|"); String replacement = "\\\\[(?:" + sequenceStr + ")\\\\]"; if (optional) { replacement = "(?:" + replacement + ")?"; } matcher.appendReplacement(sb, replacement); result = matcher.find(); } while (result); matcher.appendTail(sb); expression = sb.toString(); } // Order is important here expression = expression.replaceAll("[*]([^/(\\\\])", "[^/]*$1"); // '*' not followed by '/', '\\', or '(' expression = expression.replaceAll("(?<!\\[\\^/\\])[*]", "[^/]*"); // '*' not preceded by '[^/]' expression = expression.replaceAll("[/]{2,}$", "(?:/[^/]*)*"); // ending '//' expression = expression.replaceAll("[/]{2,}", "(?:/[^/]*)*/"); // other '//' return expression; } /** * @return the expression */ public String getSelectExpression() { return this.expression; } /** * {@inheritDoc} */ @Override public int hashCode() { return this.expression.hashCode(); } /** * {@inheritDoc} */ @Override public boolean equals( Object obj ) { if (obj == this) return true; if (obj instanceof PathExpression) { PathExpression that = (PathExpression)obj; if (!this.expression.equalsIgnoreCase(that.expression)) return false; return true; } return false; } /** * {@inheritDoc} */ @Override public String toString() { return this.expression; } /** * Determine if this path expression applies to content within the supplied workspace name. * * @param workspaceName the name of the workspace; may not be null * @return true if this path expression matches the workspace name, or false otherwise */ public boolean matchesWorkspace( String workspaceName ) { return workspacePattern.matcher(workspaceName).matches(); } /** * Obtain a Matcher that can be used to convert the supplied absolute path into an output repository, and output workspace * name, and output path. Before this method is called, be sure that the workspace for the supplied path also * {@link #matchesWorkspace(String) matches}. * * @param absolutePath the path in the workspace * @return the matcher; never null */ public final Matcher matcher( final String absolutePath ) { String path = absolutePath; // Remove all trailing '/' for proper matching ... path = path.replaceAll("/+$", ""); // See if the supplied absolute path matches the pattern ... final java.util.regex.Matcher matcher = this.matchPattern.matcher(path); if (!matcher.matches()) { // No match, so return immediately ... return new Matcher(matcher, absolutePath, null, null); } // The absolute path does match the pattern, so use the select pattern and try to grab the selected path ... final java.util.regex.Matcher selectMatcher = this.selectPattern.matcher(path); if (!selectMatcher.matches()) { // Nothing can be selected, so return immediately ... return new Matcher(matcher, null, null, null); } // Grab the selected path ... String selectedPath = selectMatcher.group(1); // Remove the trailing '/@property' ... selectedPath = selectedPath.replaceAll("/@[^/\\[\\]]+$", ""); return new Matcher(matcher, absolutePath, null, selectedPath); } @Immutable public static class Matcher { private final String inputPath; private final String selectedWorkspace; private final String selectedPath; private final java.util.regex.Matcher inputMatcher; private final int hc; protected Matcher( java.util.regex.Matcher inputMatcher, String inputPath, String selectedWorkspace, String selectedPath ) { this.inputMatcher = inputMatcher; if (selectedPath != null) selectedPath = selectedPath.replaceAll("[/]*$", ""); this.inputPath = inputPath; this.selectedWorkspace = selectedWorkspace == null || selectedWorkspace.length() == 0 ? null : selectedWorkspace; this.selectedPath = selectedPath; this.hc = HashCode.compute(this.inputPath, this.selectedPath); } public boolean matches() { return this.inputMatcher != null && this.selectedPath != null; } /** * @return inputPath */ public String getInputPath() { return this.inputPath; } /** * @return selectPattern */ public String getSelectedNodePath() { return this.selectedPath; } /** * Get the name of the selected workspace. * * @return the workspace name, or null if there is none specified */ public String getSelectedWorkspaceName() { return this.selectedWorkspace; } public int groupCount() { if (this.inputMatcher == null) return 0; return this.inputMatcher.groupCount(); } public String group( int groupNumber ) { return this.inputMatcher.group(groupNumber); } /** * {@inheritDoc} */ @Override public int hashCode() { return this.hc; } /** * {@inheritDoc} */ @Override public boolean equals( Object obj ) { if (obj == this) return true; if (obj instanceof PathExpression.Matcher) { PathExpression.Matcher that = (PathExpression.Matcher)obj; if (!this.inputPath.equalsIgnoreCase(that.inputPath)) return false; if (!this.selectedPath.equalsIgnoreCase(that.selectedPath)) return false; return true; } return false; } /** * {@inheritDoc} */ @Override public String toString() { return this.selectedPath; } } /** * Regular expression used to determine if the expression matches any single-level wildcard. */ // /*(?:[*.](?:\[\*?\])?/*)* private static final String ANYTHING_PATTERN_STRING = "/*(?:[*.](?:\\[\\*?\\])?/*)*"; private static final Pattern ANYTHING_PATTERN = Pattern.compile(ANYTHING_PATTERN_STRING); /** * Return whether this expression matches anything and therefore is not restrictive. These include expressions of any nodes (" * <code>/</code>"), any sequence of nodes ("<code>//</code>"), the self reference ("<code>.</code>"), or wildcard (" * <code>*</code>", "<code>*[]</code>" or "<code>*[*]</code>"). Combinations of these individual expressions are also * considered to match anything. * * @return true if the expression matches anything, or false otherwise */ public boolean matchesAnything() { return ANYTHING_PATTERN.matcher(expression).matches(); } public static PathExpression all() { return ALL_PATHS_EXPRESSION; } private static final PathExpression ALL_PATHS_EXPRESSION = PathExpression.compile("//"); /** * Parse a path of the form <code>{workspaceName}:{absolutePath}</code> or <code>{absolutePath}</code>. * * @param path the path * @return the workspace path, or null if the supplied path doesn't match any of the path patterns */ public static WorkspacePath parsePathInWorkspace( String path ) { // Extract the workspace name and absPath from the supplied path ... java.util.regex.Matcher pathMatcher = WORKSPACE_AND_PATH_PATTERN.matcher(path); if (!pathMatcher.matches()) { // No match ... return null; } String workspaceName = pathMatcher.group(2); String absolutePath = pathMatcher.group(3); if (workspaceName == null || workspaceName.length() == 0 || workspaceName.trim().length() == 0) workspaceName = null; return new WorkspacePath(workspaceName, absolutePath); } @Immutable public static class WorkspacePath { public final String workspaceName; public final String path; public WorkspacePath( String workspaceName, String path ) { this.workspaceName = workspaceName; this.path = path; } /** * {@inheritDoc} * * @see java.lang.Object#hashCode() */ @Override public int hashCode() { return path.hashCode(); } /** * {@inheritDoc} * * @see java.lang.Object#equals(java.lang.Object) */ @Override public boolean equals( Object obj ) { if (obj == this) return true; if (obj instanceof WorkspacePath) { WorkspacePath that = (WorkspacePath)obj; if (!ObjectUtil.isEqualWithNulls(this.workspaceName, that.workspaceName)) return false; return this.path.equals(that.path); } return false; } /** * {@inheritDoc} * * @see java.lang.Object#toString() */ @Override public String toString() { return (workspaceName != null ? workspaceName : "") + ":" + path; } public WorkspacePath withWorkspaceName( String workspaceName ) { return new WorkspacePath(workspaceName, path); } public WorkspacePath withPath( String path ) { return new WorkspacePath(workspaceName, path); } } }