/*****************************************************************************
 * Copyright (C) Codehaus.org                                                *
 * ------------------------------------------------------------------------- *
 * Licensed under the Apache License, Version 2.0 (the "License");           *
 * you may not use this file except in compliance with the License.          *
 * You may obtain a copy of the License at                                   *
 *                                                                           *
 * http://www.apache.org/licenses/LICENSE-2.0                                *
 *                                                                           *
 * Unless required by applicable law or agreed to in writing, software       *
 * distributed under the License is distributed on an "AS IS" BASIS,         *
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  *
 * See the License for the specific language governing permissions and       *
 * limitations under the License.                                            *
 *****************************************************************************/
package net.ion.rosetta.pattern;

import java.util.regex.Matcher;

import net.ion.rosetta.util.Checks;

/**
 * Provides common {@link Pattern} implementations.
 *
 * <p>
 * Every pattern produced here reports either a match length or
 * {@link Pattern#MISMATCH} for the input range {@code [begin, end)} of the
 * given {@link CharSequence}.
 *
 * @author Ben Yu
 */
public final class Patterns {

    private Patterns() {
    }

    /** A {@link Pattern} that always returns {@link Pattern#MISMATCH}. */
    public static final Pattern NEVER = new Pattern() {
        @Override
        public int match(CharSequence src, int begin, int end) {
            return Pattern.MISMATCH;
        }
    };

    /** A {@link Pattern} that always matches with match length {@code 0}. */
    public static final Pattern ALWAYS = new Pattern() {
        @Override
        public int match(CharSequence src, int begin, int end) {
            return 0;
        }
    };

    /**
     * A {@link Pattern} that matches any character and only mismatches for an
     * empty string.
     */
    public static final Pattern ANY_CHAR = hasAtLeast(1);

    /**
     * A {@link Pattern} object that matches if the input has no character left.
     * Match length is {@code 0} if succeed.
     */
    public static final Pattern EOF = hasExact(0);

    /**
     * A {@link Pattern} object that succeeds with match length {@code 2} if
     * there are at least 2 characters in the input and the first character is
     * {@code '\'}. Mismatch otherwise.
     */
    public static final Pattern ESCAPED = new Pattern() {
        @Override
        public int match(CharSequence src, int begin, int end) {
            // Fewer than two characters left: nothing can follow the backslash.
            if (begin >= end - 1) {
                return Pattern.MISMATCH;
            }
            if (src.charAt(begin) == '\\') {
                return 2;
            }
            return Pattern.MISMATCH;
        }
    };

    /** A {@link Pattern} object that matches an integer (one or more digits). */
    public static final Pattern INTEGER = many1(CharPredicates.IS_DIGIT);

    /**
     * A {@link Pattern} object that matches a decimal number that has at least
     * one digit before the decimal point. The decimal point and the numbers to
     * the right are optional.
     *
     * <p>
     * {@code 0, 11., 2.3} are all good candidates. While {@code .1, .} are not.
     */
    public static final Pattern STRICT_DECIMAL =
            INTEGER.next(isChar('.').next(many(CharPredicates.IS_DIGIT)).optional());

    /**
     * A {@link Pattern} object that matches a decimal point and one or more
     * digits after it.
     */
    public static final Pattern FRACTION = isChar('.').next(INTEGER);

    /**
     * A {@link Pattern} object that matches a decimal number that could start
     * with a decimal point or a digit.
     */
    public static final Pattern DECIMAL = STRICT_DECIMAL.or(FRACTION);

    /**
     * A {@link Pattern} object that matches a standard english word, which
     * starts with either an underscore or an alpha character, followed by 0 or
     * more alphanumeric characters.
     */
    public static final Pattern WORD =
            isChar(CharPredicates.IS_ALPHA_).next(isChar(CharPredicates.IS_ALPHA_NUMERIC_).many());

    /**
     * A {@link Pattern} object that matches a {@code '['}, followed by 0 or
     * more characters satisfying {@link CharPredicates#IS_ALPHA_NUMERIC_},
     * followed by a closing {@code ']'}.
     */
    public static final Pattern ARRAYWORD =
            isChar('[').next(isChar(CharPredicates.IS_ALPHA_NUMERIC_).many()).next(isChar(']'));

    /**
     * A {@link Pattern} object that matches an octal integer that starts with a
     * {@code 0} and is followed by 0 or more {@code [0 - 7]} characters.
     */
    public static final Pattern OCT_INTEGER = isChar('0').next(many(CharPredicates.range('0', '7')));

    /**
     * A {@link Pattern} object that matches a decimal integer, which starts
     * with a non-zero digit and is followed by 0 or more digits.
     */
    public static final Pattern DEC_INTEGER = sequence(range('1', '9'), many(CharPredicates.IS_DIGIT));

    /**
     * A {@link Pattern} object that matches a hex integer, which starts with a
     * {@code 0x} or {@code 0X}, and is followed by one or more hex digits.
     */
    public static final Pattern HEX_INTEGER =
            string("0x").or(string("0X")).next(many1(CharPredicates.IS_HEX_DIGIT));

    /**
     * A {@link Pattern} object that matches a scientific notation, such as
     * {@code 1e12}, {@code 1.2E-1}, etc.
     */
    public static final Pattern SCIENTIFIC_NOTATION =
            sequence(DECIMAL, among("eE"), among("+-").optional(), INTEGER);

    /**
     * A {@link Pattern} object that matches any regular expression pattern
     * string in the form of {@code /some pattern here/}. {@code '\'} is used as
     * escape character.
     */
    public static final Pattern REGEXP_PATTERN = getRegularExpressionPattern();

    /**
     * A {@link Pattern} object that matches regular expression modifiers, which
     * is a list of alpha characters.
     */
    public static final Pattern REGEXP_MODIFIERS = getModifiersPattern();

    /**
     * Returns a {@link Pattern} object that matches if the input has at least
     * {@code n} characters left. Match length is {@code n} if succeed.
     *
     * @param n the minimum number of characters that must remain
     * @return the pattern
     */
    public static Pattern hasAtLeast(final int n) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                // Compare against the remaining length instead of computing
                // begin + n, which would overflow for a very large n and
                // wrongly report a match.
                if (n > end - begin) {
                    return Pattern.MISMATCH;
                }
                return n;
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the input has exactly
     * {@code n} characters left. Match length is {@code n} if succeed.
     *
     * @param n the exact number of characters that must remain
     * @return the pattern
     */
    public static Pattern hasExact(final int n) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                if (end - begin != n) {
                    return Pattern.MISMATCH;
                }
                return n;
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the current character in
     * the input is equal to character {@code c}, in which case {@code 1} is
     * returned as match length. Mismatches otherwise.
     */
    public static Pattern isChar(char c) {
        return isChar(CharPredicates.isChar(c));
    }

    /**
     * Returns a {@link Pattern} object that matches if the current character in
     * the input is between character {@code c1} and {@code c2}, in which case
     * {@code 1} is returned as match length.
     */
    public static Pattern range(char c1, char c2) {
        return isChar(CharPredicates.range(c1, c2));
    }

    /**
     * Returns a {@link Pattern} object that matches if the current character in
     * the input is equal to any character in {@code chars}, in which case
     * {@code 1} is returned as match length.
     */
    public static Pattern among(String chars) {
        return isChar(CharPredicates.among(chars));
    }

    /**
     * Returns a {@link Pattern} object that matches if the current character in
     * the input satisfies {@code predicate}, in which case {@code 1} is
     * returned as match length. Mismatches when no character is left.
     */
    public static Pattern isChar(final CharPredicate predicate) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                if (begin >= end) {
                    return Pattern.MISMATCH;
                }
                return predicate.isChar(src.charAt(begin)) ? 1 : Pattern.MISMATCH;
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches a line comment started by
     * {@code begin} and ended by {@code EOF} or {@code LF} (the line feed
     * character). The terminating line feed itself is not part of the match.
     */
    public static Pattern lineComment(String begin) {
        return string(begin).next(many(CharPredicates.notChar('\n')));
    }

    /** Returns a {@link Pattern} object that matches {@code string} literally. */
    public static Pattern string(final String string) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchString(string, src, begin, end);
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches {@code string} case
     * insensitively.
     */
    public static Pattern stringCaseInsensitive(final String string) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchStringCaseInsensitive(string, src, begin, end);
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the input has at least 1
     * character and doesn't match {@code string}. {@code 1} is returned as
     * match length if succeeds.
     */
    public static Pattern notString(final String string) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                if (begin >= end) {
                    return Pattern.MISMATCH;
                }
                boolean absent = matchString(string, src, begin, end) == Pattern.MISMATCH;
                return absent ? 1 : Pattern.MISMATCH;
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the input has at least 1
     * character and doesn't match {@code string} case insensitively. {@code 1}
     * is returned as match length if succeeds.
     */
    public static Pattern notStringCaseInsensitive(final String string) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                if (begin >= end) {
                    return Pattern.MISMATCH;
                }
                boolean absent = matchStringCaseInsensitive(string, src, begin, end) == Pattern.MISMATCH;
                return absent ? 1 : Pattern.MISMATCH;
            }
        };
    }

    /** Tells whether {@code a} and {@code b} are equal after lower-casing both. */
    private static boolean compareIgnoreCase(char a, char b) {
        return Character.toLowerCase(a) == Character.toLowerCase(b);
    }

    /**
     * Matches {@code str} literally against {@code src} starting at
     * {@code begin}.
     *
     * @return the length of {@code str} on success, {@link Pattern#MISMATCH}
     *         if fewer characters remain or any character differs
     */
    private static int matchString(String str, CharSequence src, int begin, int end) {
        final int slen = str.length();
        if (end - begin < slen) {
            return Pattern.MISMATCH;
        }
        for (int i = 0; i < slen; i++) {
            if (str.charAt(i) != src.charAt(begin + i)) {
                return Pattern.MISMATCH;
            }
        }
        return slen;
    }

    /**
     * Same as {@link #matchString(String, CharSequence, int, int)} but compares
     * characters with {@link #compareIgnoreCase(char, char)}.
     */
    private static int matchStringCaseInsensitive(String str, CharSequence src, int begin, int end) {
        final int slen = str.length();
        if (end - begin < slen) {
            return Pattern.MISMATCH;
        }
        for (int i = 0; i < slen; i++) {
            if (!compareIgnoreCase(str.charAt(i), src.charAt(begin + i))) {
                return Pattern.MISMATCH;
            }
        }
        return slen;
    }

    /**
     * Returns a {@link Pattern} that succeeds with match length {@code 0} when
     * {@code pp} mismatches, and mismatches when {@code pp} matches.
     */
    static Pattern not(final Pattern pp) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                boolean matched = pp.match(src, begin, end) != Pattern.MISMATCH;
                return matched ? Pattern.MISMATCH : 0;
            }
        };
    }

    /**
     * Returns a {@link Pattern} that succeeds with match length {@code 0} when
     * {@code pp} matches (a look-ahead), and mismatches when {@code pp}
     * mismatches.
     */
    static Pattern peek(final Pattern pp) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                boolean matched = pp.match(src, begin, end) != Pattern.MISMATCH;
                return matched ? 0 : Pattern.MISMATCH;
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches if all of {@code patterns}
     * matches, in which case, the maximum match length is returned. Mismatch if
     * any one mismatches.
     */
    public static Pattern and(final Pattern... patterns) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                int longest = 0;
                for (Pattern pattern : patterns) {
                    int l = pattern.match(src, begin, end);
                    if (l == Pattern.MISMATCH) {
                        return Pattern.MISMATCH;
                    }
                    if (l > longest) {
                        longest = l;
                    }
                }
                return longest;
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches if any of {@code patterns}
     * matches, in which case, the first match length is returned. Mismatch
     * only if all of them mismatch.
     */
    public static Pattern or(final Pattern... patterns) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                for (Pattern pattern : patterns) {
                    int l = pattern.match(src, begin, end);
                    if (l != Pattern.MISMATCH) {
                        return l;
                    }
                }
                return Pattern.MISMATCH;
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches the input against
     * {@code patterns} sequentially. The total match length is returned if all
     * succeed.
     */
    public static Pattern sequence(final Pattern... patterns) {
        return new Pattern() {
            @Override
            public int match(final CharSequence src, final int begin, final int end) {
                int current = begin;
                for (Pattern pattern : patterns) {
                    int l = pattern.match(src, current, end);
                    if (l == Pattern.MISMATCH) {
                        return Pattern.MISMATCH;
                    }
                    current += l;
                }
                return current - begin;
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the input has at least
     * {@code n} characters and the first {@code n} characters all satisfy
     * {@code predicate}.
     */
    public static Pattern repeat(final int n, final CharPredicate predicate) {
        Checks.checkNonNegative(n, "n < 0");
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchRepeat(n, predicate, src, end, begin, 0);
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the input has {@code n}
     * occurrences of {@code pattern}.
     */
    static Pattern repeat(final int n, final Pattern pattern) {
        Checks.checkNonNegative(n, "n < 0");
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchRepeat(n, pattern, src, end, begin, 0);
            }
        };
    }

    /**
     * Returns a {@link Pattern} object that matches if the input starts with
     * {@code min} or more characters and all satisfy {@code predicate}.
     */
    public static Pattern many(final int min, final CharPredicate predicate) {
        Checks.checkMin(min);
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                // The mandatory prefix first, then as many more as possible.
                int minlen = matchRepeat(min, predicate, src, end, begin, 0);
                if (minlen == Pattern.MISMATCH) {
                    return Pattern.MISMATCH;
                }
                return matchMany(predicate, src, end, begin + minlen, minlen);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches 0 or more characters satisfying
     * {@code predicate}.
     */
    public static Pattern many(final CharPredicate predicate) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchMany(predicate, src, end, begin, 0);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches {@code min} or more consecutive
     * occurrences of {@code pattern}.
     */
    static Pattern many(final int min, final Pattern pattern) {
        Checks.checkMin(min);
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                int minlen = matchRepeat(min, pattern, src, end, begin, 0);
                if (minlen == Pattern.MISMATCH) {
                    return Pattern.MISMATCH;
                }
                return matchMany(pattern, src, end, begin + minlen, minlen);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches 0 or more consecutive occurrences
     * of {@code pattern}.
     */
    static Pattern many(final Pattern pattern) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchMany(pattern, src, end, begin, 0);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches at least {@code min} and up to
     * {@code max} number of characters satisfying {@code predicate}.
     */
    public static Pattern some(final int min, final int max, final CharPredicate predicate) {
        Checks.checkMinMax(min, max);
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                int minlen = matchRepeat(min, predicate, src, end, begin, 0);
                if (minlen == Pattern.MISMATCH) {
                    return Pattern.MISMATCH;
                }
                return matchSome(max - min, predicate, src, end, begin + minlen, minlen);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches up to {@code max} number of
     * characters satisfying {@code predicate}.
     */
    public static Pattern some(final int max, final CharPredicate predicate) {
        Checks.checkMax(max);
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchSome(max, predicate, src, end, begin, 0);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches at least {@code min} and up to
     * {@code max} consecutive occurrences of {@code pp}.
     */
    static Pattern some(final int min, final int max, final Pattern pp) {
        Checks.checkMinMax(min, max);
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                int minlen = matchRepeat(min, pp, src, end, begin, 0);
                if (minlen == Pattern.MISMATCH) {
                    return Pattern.MISMATCH;
                }
                return matchSome(max - min, pp, src, end, begin + minlen, minlen);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches up to {@code max} consecutive
     * occurrences of {@code pp}.
     */
    static Pattern some(final int max, final Pattern pp) {
        Checks.checkMax(max);
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                return matchSome(max, pp, src, end, begin, 0);
            }
        };
    }

    /**
     * Returns a {@link Pattern} that tries both {@code p1} and {@code p2}, and
     * picks the one with the longer match length. If both have the same length,
     * {@code p1} is favored.
     */
    public static Pattern longer(Pattern p1, Pattern p2) {
        return longest(p1, p2);
    }

    /**
     * Returns a {@link Pattern} that tries all of {@code patterns}, and picks
     * the one with the longest match length. If two patterns have the same
     * length, the first one is favored.
     */
    public static Pattern longest(final Pattern... patterns) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                int best = Pattern.MISMATCH;
                for (Pattern pattern : patterns) {
                    int l = pattern.match(src, begin, end);
                    if (l > best) {
                        best = l;
                    }
                }
                return best;
            }
        };
    }

    /**
     * Returns a {@link Pattern} that tries both {@code p1} and {@code p2}, and
     * picks the one with the shorter match length. If both have the same
     * length, {@code p1} is favored.
     */
    public static Pattern shorter(Pattern p1, Pattern p2) {
        return shortest(p1, p2);
    }

    /**
     * Returns a {@link Pattern} that tries all of {@code patterns}, and picks
     * the one with the shortest match length. If two patterns have the same
     * length, the first one is favored.
     */
    public static Pattern shortest(final Pattern... patterns) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                int best = Pattern.MISMATCH;
                for (Pattern pattern : patterns) {
                    final int l = pattern.match(src, begin, end);
                    if (l == Pattern.MISMATCH) {
                        continue;
                    }
                    if (best == Pattern.MISMATCH || l < best) {
                        best = l;
                    }
                }
                return best;
            }
        };
    }

    /**
     * Returns a {@link Pattern} that first tries {@code cond}. If it matches,
     * {@code consequence} is matched right after it and the combined length is
     * returned (mismatch if {@code consequence} mismatches). If {@code cond}
     * mismatches, the result of {@code alternative} at the original position
     * is returned.
     */
    static Pattern ifelse(final Pattern cond, final Pattern consequence, final Pattern alternative) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                final int conditionResult = cond.match(src, begin, end);
                if (conditionResult == Pattern.MISMATCH) {
                    return alternative.match(src, begin, end);
                }
                final int consequenceResult = consequence.match(src, begin + conditionResult, end);
                if (consequenceResult == Pattern.MISMATCH) {
                    return Pattern.MISMATCH;
                }
                return conditionResult + consequenceResult;
            }
        };
    }

    /**
     * Returns a {@link Pattern} that matches 1 or more characters satisfying
     * {@code predicate}.
     */
    public static Pattern many1(CharPredicate predicate) {
        return many(1, predicate);
    }

    /**
     * Adapts a regular expression pattern to a {@link Pattern}.
     *
     * <p>
     * The regular expression is applied to the sub-sequence
     * {@code [begin, end)} and must match at its start; the length of that
     * match is returned.
     */
    public static Pattern regex(final java.util.regex.Pattern p) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                if (begin > end) {
                    return Pattern.MISMATCH;
                }
                Matcher matcher = p.matcher(src.subSequence(begin, end));
                // matcher.end() is relative to the sub-sequence, i.e. it is
                // already the match length.
                return matcher.lookingAt() ? matcher.end() : Pattern.MISMATCH;
            }
        };
    }

    /** Adapts a regular expression pattern string to a {@link Pattern}. */
    public static Pattern regex(String s) {
        return regex(java.util.regex.Pattern.compile(s));
    }

    /**
     * Returns a {@link Pattern} that behaves like {@code pp} but succeeds with
     * match length {@code 0} where {@code pp} would mismatch.
     */
    static Pattern optional(final Pattern pp) {
        return new Pattern() {
            @Override
            public int match(CharSequence src, int begin, int end) {
                final int l = pp.match(src, begin, end);
                return (l == Pattern.MISMATCH) ? 0 : l;
            }
        };
    }

    /**
     * Requires the {@code n} characters starting at {@code from} to all satisfy
     * {@code predicate}.
     *
     * @param len  exclusive end index of the input
     * @param from index to start matching at
     * @param acc  length already matched by the caller, added to the result
     * @return {@code n + acc} on success, {@link Pattern#MISMATCH} otherwise
     */
    private static int matchRepeat(int n, CharPredicate predicate, CharSequence src, int len, int from, int acc) {
        // Compare n with the remaining length rather than computing from + n:
        // the sum overflows for a very large n and would skip both this check
        // and the loop below, reporting a bogus match.
        if (n > len - from) {
            return Pattern.MISMATCH;
        }
        int tail = from + n;
        for (int i = from; i < tail; i++) {
            if (!predicate.isChar(src.charAt(i))) {
                return Pattern.MISMATCH;
            }
        }
        return n + acc;
    }

    /**
     * Requires {@code n} consecutive matches of {@code pattern} starting at
     * {@code from}.
     *
     * @param len  exclusive end index of the input
     * @param from index to start matching at
     * @param acc  length already matched by the caller, added to the result
     * @return total matched length plus {@code acc}, or
     *         {@link Pattern#MISMATCH} if any occurrence mismatches
     */
    private static int matchRepeat(int n, Pattern pattern, CharSequence src, int len, int from, int acc) {
        int end = from;
        for (int i = 0; i < n; i++) {
            int l = pattern.match(src, end, len);
            if (l == Pattern.MISMATCH) {
                return Pattern.MISMATCH;
            }
            end += l;
        }
        return end - from + acc;
    }

    /**
     * Consumes up to {@code max} characters satisfying {@code predicate}
     * starting at {@code from}. Never mismatches.
     *
     * @param len  exclusive end index of the input
     * @param from index to start matching at
     * @param acc  length already matched by the caller, added to the result
     * @return number of characters consumed plus {@code acc}
     */
    private static int matchSome(int max, CharPredicate predicate, CharSequence src, int len, int from, int acc) {
        // Equivalent to min(from + max, len) but immune to overflow of
        // from + max when max is very large (e.g. Integer.MAX_VALUE).
        int limit = (max < len - from) ? from + max : len;
        for (int i = from; i < limit; i++) {
            if (!predicate.isChar(src.charAt(i))) {
                return i - from + acc;
            }
        }
        return limit - from + acc;
    }

    /**
     * Consumes up to {@code max} consecutive matches of {@code pattern}
     * starting at {@code from}. Never mismatches.
     *
     * @param len  exclusive end index of the input
     * @param from index to start matching at
     * @param acc  length already matched by the caller, added to the result
     * @return total matched length plus {@code acc}
     */
    private static int matchSome(int max, Pattern pattern, CharSequence src, int len, int from, int acc) {
        int begin = from;
        for (int i = 0; i < max; i++) {
            int l = pattern.match(src, begin, len);
            if (l == Pattern.MISMATCH) {
                break;
            }
            // A zero-length match makes no progress, so repeating it cannot
            // change the result; stop instead of spinning up to max times.
            if (l == 0) {
                break;
            }
            begin += l;
        }
        return begin - from + acc;
    }

    /**
     * Consumes as many characters satisfying {@code predicate} as possible
     * starting at {@code from}. Never mismatches.
     *
     * @param len  exclusive end index of the input
     * @param from index to start matching at
     * @param acc  length already matched by the caller, added to the result
     * @return number of characters consumed plus {@code acc}
     */
    private static int matchMany(CharPredicate predicate, CharSequence src, int len, int from, int acc) {
        for (int i = from; i < len; i++) {
            if (!predicate.isChar(src.charAt(i))) {
                return i - from + acc;
            }
        }
        return len - from + acc;
    }

    /**
     * Consumes as many consecutive matches of {@code pattern} as possible
     * starting at {@code from}. Never mismatches.
     *
     * @param len  exclusive end index of the input
     * @param from index to start matching at
     * @param acc  length already matched by the caller, added to the result
     * @return total matched length plus {@code acc}
     */
    private static int matchMany(Pattern pattern, CharSequence src, int len, int from, int acc) {
        int i = from;
        while (true) {
            int l = pattern.match(src, i, len);
            if (l == Pattern.MISMATCH) {
                return i - from + acc;
            }
            // we simply stop the loop when infinity is found. this may make the
            // parser more user-friendly.
            if (l == 0) {
                return i - from + acc;
            }
            i += l;
        }
    }

    /**
     * Builds {@link #REGEXP_PATTERN}: a {@code '/'}, then any number of either
     * an escape (a backslash followed by any one character) or a character
     * other than {@code '/'}, CR, LF and backslash, then a closing {@code '/'}.
     */
    private static Pattern getRegularExpressionPattern() {
        Pattern quote = isChar('/');
        Pattern escape = isChar('\\').next(hasAtLeast(1));
        Pattern content = or(escape, isChar(CharPredicates.notAmong("/\r\n\\")));
        return quote.next(content.many()).next(quote);
    }

    /**
     * Builds {@link #REGEXP_MODIFIERS}: 0 or more characters satisfying
     * {@link CharPredicates#IS_ALPHA}.
     */
    private static Pattern getModifiersPattern() {
        return isChar(CharPredicates.IS_ALPHA).many();
    }
}