/*****************************************************************************
* Copyright (C) Codehaus.org *
* ------------------------------------------------------------------------- *
* Licensed under the Apache License, Version 2.0 (the "License"); *
* you may not use this file except in compliance with the License. *
* You may obtain a copy of the License at *
* *
* http://www.apache.org/licenses/LICENSE-2.0 *
* *
* Unless required by applicable law or agreed to in writing, software *
* distributed under the License is distributed on an "AS IS" BASIS, *
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
* See the License for the specific language governing permissions and *
* limitations under the License. *
*****************************************************************************/
package net.ion.rosetta.pattern;
import java.util.regex.Matcher;
import net.ion.rosetta.util.Checks;
/**
* Provides common {@link Pattern} implementations.
*
* @author Ben Yu
*/
public final class Patterns {
private Patterns() {
}
/** A {@link Pattern} that always returns {@link Pattern#MISMATCH}. */
public static final Pattern NEVER = new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return Pattern.MISMATCH;
}
};
/** A {@link Pattern} that always matches with match length {@code 0}. */
public static final Pattern ALWAYS = new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return 0;
}
};
/**
* A {@link Pattern} that matches any character and only mismatches for an
* empty string.
*/
public static final Pattern ANY_CHAR = hasAtLeast(1);
/**
* A {@link Pattern} object that matches if the input has no character left.
* Match length is {@code 0} if succeed.
*/
public static final Pattern EOF = hasExact(0);
/**
* A {@link Pattern} object that succeeds with match length {@code 2} if
* there are at least 2 characters in the input and the first character is
* {@code '\'}. Mismatch otherwise.
*/
public static final Pattern ESCAPED = new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin >= end - 1)
return Pattern.MISMATCH;
else if (src.charAt(begin) == '\\')
return 2;
else
return Pattern.MISMATCH;
}
};
/** A {@link Pattern} object that matches an integer. */
public static final Pattern INTEGER = many1(CharPredicates.IS_DIGIT);
/**
* A {@link Pattern} object that matches a decimal number that has at least
* one digit before the decimal point. The decimal point and the numbers to
* the right are optional.
*
* <p>
* {@code 0, 11., 2.3} are all good candidates. While {@code .1, .} are not.
*/
public static final Pattern STRICT_DECIMAL = INTEGER.next(isChar('.').next(many(CharPredicates.IS_DIGIT)).optional());
/**
* A {@link Pattern} object that matches a decimal point and one or more
* digits after it.
*/
public static final Pattern FRACTION = isChar('.').next(INTEGER);
/**
* A {@link Pattern} object that matches a decimal number that could start
* with a decimal point or a digit.
*/
public static final Pattern DECIMAL = STRICT_DECIMAL.or(FRACTION);
/**
* A {@link Pattern} object that matches a standard english word, which
* starts with either an underscore or an alpha character, followed by 0 or
* more alphanumeric characters.
*/
public static final Pattern WORD = isChar(CharPredicates.IS_ALPHA_).next(isChar(CharPredicates.IS_ALPHA_NUMERIC_).many());
public static final Pattern ARRAYWORD = isChar('[').next(isChar(CharPredicates.IS_ALPHA_NUMERIC_).many()).next(isChar(']')) ;
/**
* A {@link Pattern} object that matches an octal integer that starts with a
* {@code 0} and is followed by 0 or more {@code [0 - 7]} characters.
*/
public static final Pattern OCT_INTEGER = isChar('0').next(many(CharPredicates.range('0', '7')));
/**
* A {@link Pattern} object that matches a decimal integer, which starts
* with a non-zero digit and is followed by 0 or more digits.
*/
public static final Pattern DEC_INTEGER = sequence(range('1', '9'), many(CharPredicates.IS_DIGIT));
/**
* A {@link Pattern} object that matches a hex integer, which starts with a
* {@code 0x} or {@code 0X}, and is followed by one or more hex digits.
*/
public static final Pattern HEX_INTEGER = string("0x").or(string("0X")).next(many1(CharPredicates.IS_HEX_DIGIT));
/**
* A {@link Pattern} object that matches a scientific notation, such as
* {@code 1e12}, {@code 1.2E-1}, etc.
*/
public static final Pattern SCIENTIFIC_NOTATION = sequence(DECIMAL, among("eE"), among("+-").optional(), INTEGER);
/**
* A {@link Pattern} object that matches any regular expression pattern
* string in the form of {@code /some pattern here/}. {@code '\'} is used as
* escape character.
*/
public static final Pattern REGEXP_PATTERN = getRegularExpressionPattern();
/**
* A {@link Pattern} object that matches regular expression modifiers, which
* is a list of alpha characters.
*/
public static final Pattern REGEXP_MODIFIERS = getModifiersPattern();
/**
* Returns a {@link Pattern} object that matches if the input has at least
* {@code n} characters left. Match length is {@code n} if succeed.
*/
public static Pattern hasAtLeast(final int n) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin + n > end)
return Pattern.MISMATCH;
else
return n;
}
};
}
/**
* Returns a {@link Pattern} object that matches if the input has exactly
* {@code n} characters left. Match length is {@code n} if succeed.
*/
public static Pattern hasExact(final int n) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin + n != end)
return Pattern.MISMATCH;
else
return n;
}
};
}
/**
* Returns a {@link Pattern} object that matches if the current character in
* the input is equal to character {@code c}, in which case {@code 1} is
* returned as match length. Mismatches otherwise.
*/
public static Pattern isChar(char c) {
return isChar(CharPredicates.isChar(c));
}
/**
* Returns a {@link Pattern} object that matches if the current character in
* the input is between character {@code c1} and {@code c2}, in which case
* {@code 1} is returned as match length.
*/
public static Pattern range(char c1, char c2) {
return isChar(CharPredicates.range(c1, c2));
}
/**
* Returns a {@link Pattern} object that matches if the current character in
* the input is equal to any character in {@code chars}, in which case
* {@code 1} is returned as match length.
*/
public static Pattern among(String chars) {
return isChar(CharPredicates.among(chars));
}
/**
* Returns a {@link Pattern} object that matches if the current character in
* the input satisfies {@code predicate}, in which case {@code 1} is
* returned as match length.
*/
public static Pattern isChar(final CharPredicate predicate) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin >= end)
return Pattern.MISMATCH;
else if (predicate.isChar(src.charAt(begin)))
return 1;
else
return Pattern.MISMATCH;
}
};
}
/**
* Returns a {@link Pattern} object that matches a line comment started by
* {@code begin} and ended by {@code EOF} or {@code LF} (the line feed
* character).
*/
public static Pattern lineComment(String begin) {
return string(begin).next(many(CharPredicates.notChar('\n')));
}
/** Returns a {@link Pattern} object that matches {@code string} literally. */
public static Pattern string(final String string) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchString(string, src, begin, end);
}
};
}
/**
* Returns a {@link Pattern} object that matches {@code string} case
* insensitively.
*/
public static Pattern stringCaseInsensitive(final String string) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchStringCaseInsensitive(string, src, begin, end);
}
};
}
/**
* Returns a {@link Pattern} object that matches if the input has at least 1
* character and doesn't match {@code string}. {@code 1} is returned as
* match length if succeeds.
*/
public static Pattern notString(final String string) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin >= end)
return MISMATCH;
if (matchString(string, src, begin, end) == Pattern.MISMATCH)
return 1;
else
return MISMATCH;
}
};
}
/**
* Returns a {@link Pattern} object that matches if the input has at least 1
* character and doesn't match {@code string} case insensitively. {@code 1}
* is returned as match length if succeeds.
*/
public static Pattern notStringCaseInsensitive(final String string) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin >= end)
return MISMATCH;
if (matchStringCaseInsensitive(string, src, begin, end) == Pattern.MISMATCH)
return 1;
else
return MISMATCH;
}
};
}
private static boolean compareIgnoreCase(char a, char b) {
return Character.toLowerCase(a) == Character.toLowerCase(b);
}
private static int matchString(String str, CharSequence src, int begin, int end) {
final int slen = str.length();
if (end - begin < slen)
return Pattern.MISMATCH;
for (int i = 0; i < slen; i++) {
final char exp = str.charAt(i);
final char enc = src.charAt(begin + i);
if (exp != enc) {
return Pattern.MISMATCH;
}
}
return slen;
}
private static int matchStringCaseInsensitive(String str, CharSequence src, int begin, int end) {
final int slen = str.length();
if (end - begin < slen)
return Pattern.MISMATCH;
for (int i = 0; i < slen; i++) {
final char exp = str.charAt(i);
final char enc = src.charAt(begin + i);
if (!compareIgnoreCase(exp, enc)) {
return Pattern.MISMATCH;
}
}
return slen;
}
static Pattern not(final Pattern pp) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (pp.match(src, begin, end) != Pattern.MISMATCH)
return Pattern.MISMATCH;
else
return 0;
}
};
}
static Pattern peek(final Pattern pp) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (pp.match(src, begin, end) == Pattern.MISMATCH)
return Pattern.MISMATCH;
else
return 0;
}
};
}
/**
* Returns a {@link Pattern} that matches if all of {@code patterns}
* matches, in which case, the maximum match length is returned. Mismatch if
* any one mismatches.
*/
public static Pattern and(final Pattern... patterns) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int ret = 0;
for (Pattern pattern : patterns) {
int l = pattern.match(src, begin, end);
if (l == MISMATCH)
return MISMATCH;
if (l > ret)
ret = l;
}
return ret;
}
};
}
/**
* Returns a {@link Pattern} that matches if any of {@code patterns}
* matches, in which case, the first match length is returned. Mismatch if
* any one mismatches.
*/
public static Pattern or(final Pattern... patterns) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
for (Pattern pattern : patterns) {
int l = pattern.match(src, begin, end);
if (l != MISMATCH)
return l;
}
return MISMATCH;
}
};
}
/**
* Returns a {@link Pattern} object that matches the input against
* {@code patterns} sequentially. Te total match length is returned if all
* succeed.
*/
public static Pattern sequence(final Pattern... patterns) {
return new Pattern() {
@Override
public int match(final CharSequence src, final int begin, final int end) {
int current = begin;
for (Pattern pattern : patterns) {
int l = pattern.match(src, current, end);
if (l == Pattern.MISMATCH)
return l;
current += l;
}
return current - begin;
}
};
}
/**
* Returns a {@link Pattern} object that matches if the input has at least
* {@code n} characters and the first {@code n} characters all satisfy
* {@code predicate}.
*/
public static Pattern repeat(final int n, final CharPredicate predicate) {
Checks.checkNonNegative(n, "n < 0");
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchRepeat(n, predicate, src, end, begin, 0);
}
};
}
/**
* Returns a {@link Pattern} object that matches if the input has {@code n}
* occurrences of {@code pattern}.
*/
static Pattern repeat(final int n, final Pattern pattern) {
Checks.checkNonNegative(n, "n < 0");
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchRepeat(n, pattern, src, end, begin, 0);
}
};
}
/**
* Returns a {@link Pattern} object that matches if the input starts with
* {@code min} or more characters and all satisfy {@code predicate}.
*/
public static Pattern many(final int min, final CharPredicate predicate) {
Checks.checkMin(min);
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int minlen = matchRepeat(min, predicate, src, end, begin, 0);
if (minlen == MISMATCH)
return MISMATCH;
return matchMany(predicate, src, end, begin + minlen, minlen);
}
};
}
/**
* Returns a {@link Pattern} that matches 0 or more characters satisfying
* {@code predicate}.
*/
public static Pattern many(final CharPredicate predicate) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchMany(predicate, src, end, begin, 0);
}
};
}
static Pattern many(final int min, final Pattern pattern) {
Checks.checkMin(min);
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int minlen = matchRepeat(min, pattern, src, end, begin, 0);
if (MISMATCH == minlen)
return MISMATCH;
return matchMany(pattern, src, end, begin + minlen, minlen);
}
};
}
static Pattern many(final Pattern pattern) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchMany(pattern, src, end, begin, 0);
}
};
}
/**
* Returns a {@link Pattern} that matches at least {@code min} and up to
* {@code max} number of characters satisfying {@code predicate},
*/
public static Pattern some(final int min, final int max, final CharPredicate predicate) {
Checks.checkMinMax(min, max);
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int minlen = matchRepeat(min, predicate, src, end, begin, 0);
if (minlen == MISMATCH)
return MISMATCH;
return matchSome(max - min, predicate, src, end, begin + minlen, minlen);
}
};
}
/**
* Returns a {@link Pattern} that matches up to {@code max} number of
* characters satisfying {@code predicate}.
*/
public static Pattern some(final int max, final CharPredicate predicate) {
Checks.checkMax(max);
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchSome(max, predicate, src, end, begin, 0);
}
};
}
static Pattern some(final int min, final int max, final Pattern pp) {
Checks.checkMinMax(min, max);
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int minlen = matchRepeat(min, pp, src, end, begin, 0);
if (MISMATCH == minlen)
return MISMATCH;
return matchSome(max - min, pp, src, end, begin + minlen, minlen);
}
};
}
static Pattern some(final int max, final Pattern pp) {
Checks.checkMax(max);
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
return matchSome(max, pp, src, end, begin, 0);
}
};
}
/**
* Returns a {@link Pattern} that tries both {@code p1} and {@code p2}, and
* picks the one with the longer match length. If both have the same length,
* {@code p1} is favored.
*/
public static Pattern longer(Pattern p1, Pattern p2) {
return longest(p1, p2);
}
/**
* Returns a {@link Pattern} that tries all of {@code patterns}, and picks
* the one with the longest match length. If two patterns have the same
* length, the first one is favored.
*/
public static Pattern longest(final Pattern... patterns) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int r = MISMATCH;
for (Pattern pattern : patterns) {
int l = pattern.match(src, begin, end);
if (l > r)
r = l;
}
return r;
}
};
}
/**
* Returns a {@link Pattern} that tries both {@code p1} and {@code p2}, and
* picks the one with the shorter match length. If both have the same
* length, {@code p1} is favored.
*/
public static Pattern shorter(Pattern p1, Pattern p2) {
return shortest(p1, p2);
}
/**
* Returns a {@link Pattern} that tries all of {@code patterns}, and picks
* the one with the shortest match length. If two patterns have the same
* length, the first one is favored.
*/
public static Pattern shortest(final Pattern... patterns) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
int r = MISMATCH;
for (int i = 0; i < patterns.length; i++) {
final int l = patterns[i].match(src, begin, end);
if (l != MISMATCH) {
if (r == MISMATCH || l < r)
r = l;
}
}
return r;
}
};
}
static Pattern ifelse(final Pattern cond, final Pattern consequence, final Pattern alternative) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
final int conditionResult = cond.match(src, begin, end);
if (conditionResult == MISMATCH) {
return alternative.match(src, begin, end);
} else {
final int consequenceResult = consequence.match(src, begin + conditionResult, end);
if (consequenceResult == MISMATCH)
return MISMATCH;
else
return conditionResult + consequenceResult;
}
}
};
}
/**
* Returns a {@link Pattern} that matches 1 or more characters satisfying
* {@code predicate}.
*/
public static Pattern many1(CharPredicate predicate) {
return many(1, predicate);
}
/** Adapts a regular expression pattern to a {@link Pattern}. */
public static Pattern regex(final java.util.regex.Pattern p) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
if (begin > end)
return Pattern.MISMATCH;
Matcher matcher = p.matcher(src.subSequence(begin, end));
if (matcher.lookingAt())
return matcher.end();
return Pattern.MISMATCH;
}
};
}
/** Adapts a regular expression pattern string to a {@link Pattern}. */
public static Pattern regex(String s) {
return regex(java.util.regex.Pattern.compile(s));
}
static Pattern optional(final Pattern pp) {
return new Pattern() {
@Override
public int match(CharSequence src, int begin, int end) {
final int l = pp.match(src, begin, end);
return (l == Pattern.MISMATCH) ? 0 : l;
}
};
}
private static int matchRepeat(int n, CharPredicate predicate, CharSequence src, int len, int from, int acc) {
int tail = from + n;
if (tail > len)
return Pattern.MISMATCH;
for (int i = from; i < tail; i++) {
if (!predicate.isChar(src.charAt(i)))
return Pattern.MISMATCH;
}
return n + acc;
}
private static int matchRepeat(int n, Pattern pattern, CharSequence src, int len, int from, int acc) {
int end = from;
for (int i = 0; i < n; i++) {
int l = pattern.match(src, end, len);
if (l == Pattern.MISMATCH)
return Pattern.MISMATCH;
end += l;
}
return end - from + acc;
}
private static int matchSome(int max, CharPredicate predicate, CharSequence src, int len, int from, int acc) {
int k = Math.min(max + from, len);
for (int i = from; i < k; i++) {
if (!predicate.isChar(src.charAt(i)))
return i - from + acc;
}
return k - from + acc;
}
private static int matchSome(int max, Pattern pattern, CharSequence src, int len, int from, int acc) {
int begin = from;
for (int i = 0; i < max; i++) {
int l = pattern.match(src, begin, len);
if (Pattern.MISMATCH == l)
return begin - from + acc;
begin += l;
}
return begin - from + acc;
}
private static int matchMany(CharPredicate predicate, CharSequence src, int len, int from, int acc) {
for (int i = from; i < len; i++) {
if (!predicate.isChar(src.charAt(i)))
return i - from + acc;
}
return len - from + acc;
}
private static int matchMany(Pattern pattern, CharSequence src, int len, int from, int acc) {
for (int i = from;;) {
int l = pattern.match(src, i, len);
if (Pattern.MISMATCH == l)
return i - from + acc;
// we simply stop the loop when infinity is found. this may make the
// parser more user-friendly.
if (l == 0)
return i - from + acc;
i += l;
}
}
private static final Pattern getRegularExpressionPattern() {
Pattern quote = isChar('/');
Pattern escape = isChar('\\').next(hasAtLeast(1));
Pattern content = or(escape, isChar(CharPredicates.notAmong("/\r\n\\")));
return quote.next(content.many()).next(quote);
}
private static final Pattern getModifiersPattern() {
return isChar(CharPredicates.IS_ALPHA).many();
}
}