//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.google.gdata.util.parser;
import java.util.*;
/**
* The <code>Chset</code> (character set) parser matches the current character
* in the parse buffer against an arbitrary character set. The character set is
* represented as a sorted array of ranges for which a match should be
* successful. Matching takes O(log nranges) time. There are predefined
* character sets for matching any character (<code>ANYCHAR</code>), no
* characters (<code>NOTHING</code>) and some standard 7-bit ASCII ranges
* (<code>ALNUM</code>, <code>ALPHA</code>, <code>DIGIT</code>,
* <code>XDIGIT</code>, <code>LOWER</code>, <code>UPPER</code>,
* <code>WHITESPACE</code>), and <code>ASCII</code>.
*
* Note that the character set parser only matches a single character of the
* parse buffer. The <code>Sequence</code> or <code>Repeat</code> parsers need
* to be used to match more than one character.
*
* The following matches vowels and digits:
*
* Parser p = new Chset("uoiea0-9");
* p.parse("a") -> matches "a"
* p.parse("3") -> matches "3"
* p.parse("b") -> no match
*
* @see Parser
*
*/
public class Chset extends Parser<Object> implements Cloneable {

  /** Smallest character value a set can contain. */
  protected static final char MIN_CHAR = 0;

  /** Largest character value a set can contain. */
  protected static final char MAX_CHAR = 65535;

  /** Largest character value tracked by the ASCII bitmap. */
  private static final char MAX_ASCII_CHAR = 127;

  public static final Chset ANYCHAR = new Chset(MIN_CHAR, MAX_CHAR);
  public static final Chset NOTHING = new Chset();
  public static final Chset ALNUM = new Chset("a-zA-Z0-9");
  public static final Chset ALPHA = new Chset("a-zA-Z");
  public static final Chset DIGIT = new Chset("0-9");
  public static final Chset XDIGIT = new Chset("0-9a-fA-F");
  public static final Chset LOWER = new Chset("a-z");
  public static final Chset UPPER = new Chset("A-Z");
  public static final Chset WHITESPACE = new Chset(" \t\r\n\f");
  public static final Chset ASCII = new Chset(MIN_CHAR, MAX_ASCII_CHAR);

  /**
   * The primary representation: ranges sorted by <code>first</code>, kept
   * disjoint and non-adjacent by <code>addRange</code> and
   * <code>removeRange</code>. Every <code>Range</code> stored here is owned
   * exclusively by this set, because ranges are mutated in place.
   */
  private final List<Range> ranges = new ArrayList<Range>();

  /**
   * A secondary representation for ASCII members of the character set.
   * Maintaining this bitmap allows us to check ASCII characters for set
   * membership quickly. It must be rebuilt with <code>refreshAsciiSet</code>
   * whenever <code>ranges</code> changes.
   */
  private final BitSet asciiSet = new BitSet(MAX_ASCII_CHAR + 1);

  /**
   * Class constructor for an empty character set.
   */
  public Chset() {
  }

  /**
   * Class constructor for a character literal.
   *
   * @param ch The character literal for this character set to match against.
   */
  public Chset(char ch) {
    this(ch, ch);
  }

  /**
   * Class constructor for a single character range. The range is inclusive:
   * all characters including <code>min</code> and <code>max</code> match.
   *
   * @param min The beginning of the character range.
   * @param max The end of the character range.
   * @throws IllegalArgumentException if <code>min</code> is greater than
   *     <code>max</code>.
   */
  public Chset(char min, char max) {
    ranges.add(new Range(min, max));
    refreshAsciiSet();
  }

  /**
   * Class constructor that initializes a <code>Chset</code> from a string
   * specification. The specification is a sequence of single characters and
   * inclusive ranges written as <code>x-y</code>. A <code>'-'</code> that is
   * the last character of the specification is taken literally.
   *
   * @param spec The string specification to initialize the <code>Chset</code>
   *     from.
   * @throws IllegalArgumentException if the specification contains a
   *     descending range such as <code>z-a</code>.
   */
  public Chset(String spec) {
    int len = spec.length();
    int i = 0;
    while (i < len) {
      char start = spec.charAt(i);
      boolean dashFollows = (i + 1 < len) && (spec.charAt(i + 1) == '-');
      if (!dashFollows) {
        // Plain single character.
        addRange(new Range(start, start));
        i += 1;
      } else if (i + 2 < len) {
        // "x-y": an inclusive range.
        addRange(new Range(start, spec.charAt(i + 2)));
        i += 3;
      } else {
        // Trailing "x-": both characters are literals and the spec is done.
        addRange(new Range(start, start));
        addRange(new Range('-', '-'));
        break;
      }
    }
    // The bitmap is only read by test(), so one rebuild at the end suffices.
    refreshAsciiSet();
  }

  /**
   * Returns a clone character set of <code>this</code>. The clone is always a
   * plain <code>Chset</code> holding its own copies of the ranges.
   */
  @Override
  public Object clone() {
    Chset copy = new Chset();
    for (Range r : ranges) {
      copy.ranges.add(new Range(r.first, r.last));
    }
    copy.refreshAsciiSet();
    return copy;
  }

  /**
   * Matches <code>buf[start]</code> against the character set.
   *
   * @return 1 if <code>start &lt; end</code> and the character at
   *     <code>buf[start]</code> is in the set, <code>NO_MATCH</code> otherwise.
   *
   * @see Parser#parse
   */
  @Override
  public int parse(char[] buf, int start, int end, Object data) {
    if ((start < end) && test(buf[start])) {
      return 1;
    }
    return NO_MATCH;
  }

  /**
   * Tests to see if a single character matches the character set.
   *
   * @param ch The character to test.
   * @return <code>true</code> if <code>ch</code> is a member of the set.
   */
  public boolean test(char ch) {
    if (ch <= MAX_ASCII_CHAR) {
      return asciiSet.get(ch);
    }
    return testRanges(ch);
  }

  /**
   * Tests to see if a single character matches the character set, but only
   * looks at the ranges representation.
   *
   * @param ch The character to test.
   * @return <code>true</code> if some range contains <code>ch</code>.
   */
  protected boolean testRanges(char ch) {
    int rangeCount = ranges.size();
    if (rangeCount == 0) {
      return false;
    }
    if (rangeCount == 1) {
      // Optimization for a common simple case -- we don't need to do a find().
      return ranges.get(0).includes(ch);
    }
    int pos = find(ch);
    // We need to test both the range at the position the character would be
    // inserted at and the preceding range due to the semantics of find().
    // For example, if the Chset contains a single range of [10-19], then
    // find() will return 1 for the range [11-11] and we'll want to test
    // against 'pos - 1'.
    if ((pos != rangeCount) && ranges.get(pos).includes(ch)) {
      return true;
    }
    return (pos != 0) && ranges.get(pos - 1).includes(ch);
  }

  /**
   * Sets the specified range of characters in the character set so that
   * subsequent calls to <code>test</code> for characters within the range will
   * return <code>true</code>.
   *
   * @param min The beginning of the range (inclusive).
   * @param max The end of the range (inclusive).
   * @throws IllegalArgumentException if <code>min</code> is greater than
   *     <code>max</code>.
   *
   * @see #union
   */
  protected void set(char min, char max) {
    addRange(new Range(min, max));
    refreshAsciiSet();
  }

  /**
   * Adds the specified range to the range array, merging it with existing
   * ranges where possible. Does NOT rebuild the ASCII bitmap; callers must
   * invoke <code>refreshAsciiSet</code> once they are done mutating.
   *
   * The argument is never stored: when a new entry is needed a copy is
   * inserted. Ranges are mutated in place by later operations, so storing a
   * range that belongs to another <code>Chset</code> (as <code>union</code>
   * would otherwise do) would let changes to one set corrupt the other.
   */
  private void addRange(Range r) {
    if (ranges.isEmpty()) {
      ranges.add(new Range(r.first, r.last));
      return;
    }
    int pos = find(r.first);
    int count = ranges.size();
    boolean hasNext = pos != count;
    boolean hasPrev = pos != 0;
    // Nothing to do if an existing neighbour already covers the new range.
    if ((hasNext && ranges.get(pos).includes(r))
        || (hasPrev && ranges.get(pos - 1).includes(r))) {
      return;
    }
    if (hasPrev && ranges.get(pos - 1).mergeable(r)) {
      merge(pos - 1, r);
    } else if (hasNext && ranges.get(pos).mergeable(r)) {
      merge(pos, r);
    } else {
      ranges.add(pos, new Range(r.first, r.last));
    }
  }

  /**
   * Clears the specified range of characters from the character set so that
   * subsequent calls to <code>test</code> for characters within the range will
   * return <code>false</code>.
   *
   * @param min The beginning of the range (inclusive).
   * @param max The end of the range (inclusive).
   * @throws IllegalArgumentException if <code>min</code> is greater than
   *     <code>max</code>.
   *
   * @see #difference
   */
  protected void clear(char min, char max) {
    removeRange(new Range(min, max));
    refreshAsciiSet();
  }

  /**
   * Removes the specified range from the range array, trimming, splitting or
   * dropping existing ranges as needed. Does NOT rebuild the ASCII bitmap;
   * callers must invoke <code>refreshAsciiSet</code> once they are done
   * mutating. The argument is only read, never stored or modified.
   */
  private void removeRange(Range r) {
    if (ranges.isEmpty()) {
      return;
    }
    int pos = find(r.first);
    if (pos > 0) {
      // The preceding range starts before r.first; it may still reach into r.
      Range prev = ranges.get(pos - 1);
      if (prev.includes(r.first)) {
        if (prev.last > r.last) {
          // r lies strictly inside prev: split prev into two pieces.
          Range tail = new Range(r.last + 1, prev.last);
          prev.last = r.first - 1;
          ranges.add(pos, tail);
          return;
        }
        // prev overlaps only the front of r: trim its end.
        prev.last = r.first - 1;
      }
    }
    // Drop every range that r swallows whole.
    while ((pos < ranges.size()) && r.includes(ranges.get(pos))) {
      ranges.remove(pos);
    }
    // The next survivor may overlap the back of r: trim its start.
    if ((pos < ranges.size()) && ranges.get(pos).includes(r.last)) {
      ranges.get(pos).first = r.last + 1;
    }
  }

  /**
   * Reconstructs the BitSet representation of the ASCII characters in the
   * set, so that it matches what's stored in ranges.
   */
  private void refreshAsciiSet() {
    asciiSet.clear();
    for (char ch = MIN_CHAR; ch <= MAX_ASCII_CHAR; ch++) {
      if (testRanges(ch)) {
        asciiSet.set(ch);
      }
    }
  }

  /**
   * Returns the size of the range array.
   */
  protected int size() {
    return ranges.size();
  }

  /**
   * Binary-searches the range array by range start. Returns the index of the
   * range whose <code>first</code> equals the given value if there is one;
   * otherwise returns the index of the first range that starts after the
   * given value (or <code>size()</code> if there is none), i.e. the insertion
   * point that keeps the array sorted.
   *
   * @param first The start of the range to find the insertion point for.
   *
   * @see #test
   * @see #set
   * @see #clear
   * @see Arrays#binarySearch
   */
  private int find(int first) {
    int lo = 0;
    int hi = ranges.size() - 1;
    while (lo <= hi) {
      int mid = lo + (hi - lo) / 2;
      int midFirst = ranges.get(mid).first;
      if (midFirst < first) {
        lo = mid + 1;
      } else if (midFirst > first) {
        hi = mid - 1;
      } else {
        return mid;
      }
    }
    return lo;
  }

  /**
   * Merge the specified range with the range at the specified position in the
   * range array. After performing the merge operation, we iterate down the
   * range array and continue merging any newly mergeable ranges. The specified
   * range and the range at the specified position in the range array must be
   * mergeable. Only the stored range is modified; <code>r</code> is read-only.
   *
   * @see #set
   * @see #clear
   */
  private void merge(int pos, Range r) {
    Range target = ranges.get(pos);
    target.merge(r);
    int next = pos + 1;
    while ((next < ranges.size()) && target.mergeable(ranges.get(next))) {
      target.merge(ranges.get(next));
      ranges.remove(next);
    }
  }

  /**
   * Creates a new character set which matches a character if that character
   * does not match the <code>subject</code> character set. This operation is
   * implemented by taking the difference of the <code>ANYCHAR</code> character
   * set and the <code>subject</code> character set.
   *
   * ~subject --> anychar - subject
   *
   * @param subject The source character set.
   */
  public static Chset not(Chset subject) {
    return difference(ANYCHAR, subject);
  }

  /**
   * Creates a new character set which matches a character if that character
   * matches either the <code>left</code> or <code>right</code> character sets.
   * Neither argument is modified, and the result shares no range objects with
   * either of them.
   *
   * left | right
   *
   * @param left The left source character set.
   *
   * @param right The right source character set.
   */
  public static Chset union(Chset left, Chset right) {
    Chset result = (Chset) left.clone();
    for (Range r : right.ranges) {
      result.addRange(r);
    }
    result.refreshAsciiSet();
    return result;
  }

  /**
   * Creates a new character set which matches a character if that character
   * matches the <code>left</code> character set but does not match the
   * <code>right</code> character set. Neither argument is modified.
   *
   * left - right
   *
   * @param left The left source character set.
   *
   * @param right The right source character set.
   */
  public static Chset difference(Chset left, Chset right) {
    Chset result = (Chset) left.clone();
    for (Range r : right.ranges) {
      result.removeRange(r);
    }
    result.refreshAsciiSet();
    return result;
  }

  /**
   * Creates a new character set which matches a character if that character
   * matches both the <code>left</code> and <code>right</code> character sets.
   *
   * left & right --> left - ~right
   *
   * @param left The left source character set.
   *
   * @param right The right source character set.
   */
  public static Chset intersection(Chset left, Chset right) {
    return difference(left, not(right));
  }

  /**
   * Creates a new character set which matches a character if that character
   * matches the <code>left</code> character set or the <code>right</code>
   * character set, but not both.
   *
   * left ^ right --> (left - right) | (right - left)
   *
   * @param left The left source character set.
   *
   * @param right The right source character set.
   */
  public static Chset xor(Chset left, Chset right) {
    return union(difference(left, right), difference(right, left));
  }

  /**
   * Returns the ranges as space-separated <code>first-last</code> pairs. The
   * bounds are printed as their integer values, not as characters.
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder();
    for (int i = 0; i < ranges.size(); i++) {
      Range r = ranges.get(i);
      if (i > 0) {
        buf.append(" ");
      }
      buf.append(r.first);
      buf.append("-");
      buf.append(r.last);
    }
    return buf.toString();
  }

  /**
   * The <code>Range</code> class represents a range from
   * <code>[first,last]</code>. It is used by the <code>Chset</code> class to
   * implement character sets for large alphabets where a bitmap based approach
   * would be too expensive in terms of memory. The <code>first</code> and
   * <code>last</code> member variables are specified as integers to avoid
   * casting that would be necessary if they were specified as characters due to
   * java's type promotion rules.
   *
   * Instances are mutable: <code>Chset</code> adjusts <code>first</code> and
   * <code>last</code> in place when merging or clearing.
   */
  static class Range {
    int first;
    int last;

    /**
     * Class constructor.
     *
     * @param first The beginning of the range.
     *
     * @param last The end of the range.
     *
     * @throws IllegalArgumentException if <code>first</code> is greater than
     *     <code>last</code>.
     */
    Range(int first, int last) {
      if (first > last) {
        throw new IllegalArgumentException("descending ranges not supported: " +
            first + "-" + last);
      }
      this.first = first;
      this.last = last;
    }

    /**
     * Tests whether the specified character lies within the target range.
     *
     * @param ch The character to test for inclusion.
     *
     * @see #test
     * @see #set
     * @see #clear
     */
    boolean includes(int ch) {
      return (first <= ch) && (ch <= last);
    }

    /**
     * Tests whether the specified range lies entirely within the target range.
     *
     * @param r The range to test for inclusion.
     *
     * @see #set
     * @see #clear
     */
    boolean includes(Range r) {
      return (first <= r.first) && (r.last <= last);
    }

    /**
     * Tests whether the specified range is mergeable with the target range. Two
     * ranges are mergeable if they can be replaced with a single range that
     * spans exactly the same range of values, i.e. they overlap or are
     * directly adjacent.
     *
     * @param r The range to test for mergeability with.
     *
     * @see #set
     */
    boolean mergeable(Range r) {
      // A range is mergeable if there are no gaps between the ranges. If there
      // is a gap, then it will be obvious as the difference in the extremes will
      // be greater than the sum of the ranges.
      int spanLength = 1 + Math.max(last, r.last) - Math.min(first, r.first);
      int summedLengths = (1 + r.last - r.first) + (1 + last - first);
      return spanLength <= summedLengths;
    }

    /**
     * Merges the specified range with the target range. This function is simple
     * minded and will produce unexpected results if the ranges being merged are
     * not <code>mergeable</code>.
     *
     * @param r The range to merge with.
     *
     * @see #set
     * @see #merge
     */
    void merge(Range r) {
      first = Math.min(first, r.first);
      last = Math.max(last, r.last);
    }
  }
}