/*
* xtc - The eXTensible Compiler
* Copyright (C) 2004-2007 Robert Grimm
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* version 2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
* USA.
*/
package xtc.parser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import xtc.util.Utilities;
/**
* A character class terminal.
*
* <p />Note that {@link #equals(Object)} only determines whether the
* two character class terminals have the same structure (that is, are
* both exclusive or non-exclusive and have the same list of character
* ranges), but does not determine whether the two character class
* terminals recognize the same characters.
*
* @author Robert Grimm
* @version $Revision: 1.18 $
*/
public class CharClass extends CharTerminal {
/**
 * Parser for a character class specification.
 *
 * <p />The parser is a simple cursor over the specification string:
 * {@link #hasNext()} tests for remaining input, {@link #next()}
 * returns the next character with escape sequences resolved, and
 * {@link #hasRange()} consumes a range delimiter if one comes next.
 */
public static class Parser {

  /** The string. */
  protected String s;

  /** The index into the string. */
  protected int idx;

  /**
   * Create a new character class parser for the specified string.
   * Note that the string must not include the leading
   * '<code>[</code>' and trailing '<code>]</code>' characters.
   *
   * @param s The string to parse.
   */
  public Parser(String s) {
    this.s = s;
    idx = 0;
  }

  /**
   * Determine whether there are more characters.
   *
   * @return <code>true</code> if there are more characters.
   */
  public boolean hasNext() {
    return idx < s.length();
  }

  /**
   * Determine whether the next character is a range delimiter
   * '<code>-</code>'.  Note that this test is <i>destructive</i>:
   * if the next character is a range delimiter, it is consumed.
   * At the end of the string, this method returns
   * <code>false</code> and consumes nothing.
   *
   * @return <code>true</code> if the next character is a range
   *   delimiter.
   */
  public boolean hasRange() {
    if ((idx < s.length()) && ('-' == s.charAt(idx))) {
      idx++;
      return true;
    }
    return false;
  }

  /**
   * Return the next character.  If the character is represented by
   * an escape sequence, it is unescaped.  The recognized escapes are
   * a backslash followed by one of <code>b</code>, <code>t</code>,
   * <code>n</code>, <code>f</code>, <code>r</code>, a double quote,
   * a single quote, <code>-</code>, <code>[</code>, <code>]</code>,
   * or another backslash, as well as a backslash followed by
   * <code>u</code> and exactly four hexadecimal digits.
   *
   * @return The next character.
   * @throws IllegalArgumentException
   *   Signals an unknown, incomplete, or malformed escape sequence.
   * @throws IndexOutOfBoundsException
   *   Signals that there are no more characters; callers are
   *   expected to check {@link #hasNext()} first.
   */
  public char next() {
    char c = s.charAt(idx);
    idx++;

    if ('\\' != c) {
      return c;
    }

    // A backslash must be followed by the character it escapes;
    // report a dangling backslash like any other malformed escape
    // instead of running off the end of the string.
    if (idx >= s.length()) {
      throw new IllegalArgumentException("Incomplete character escape ('\\')");
    }

    c = s.charAt(idx);
    idx++;

    switch (c) {
    case 'b':
      return '\b';
    case 't':
      return '\t';
    case 'n':
      return '\n';
    case 'f':
      return '\f';
    case 'r':
      return '\r';
    case '"':
    case '\'':
    case '-':
    case '[':
    case '\\':
    case ']':
      // These characters simply stand for themselves.
      return c;
    case 'u':
      return nextUnicode();
    default:
      throw new IllegalArgumentException("Illegal character escape (\'\\"
                                         + c + "\')");
    }
  }

  /**
   * Parse the four hexadecimal digits of a Unicode escape, with the
   * index positioned just past the '<code>u</code>'.
   *
   * <p />Each digit is validated individually.  A general-purpose
   * integer parser is deliberately not used, because it would also
   * accept a leading sign and thus silently produce a wrong
   * character.
   *
   * @return The character denoted by the escape.
   * @throws IllegalArgumentException
   *   Signals that fewer than four characters remain or that one of
   *   them is not a hexadecimal digit.
   */
  private char nextUnicode() {
    final int end = idx + 4;

    if (end > s.length()) {
      throw new IllegalArgumentException("Illegal Unicode escape (\'\\u"
                                         + s.substring(idx)
                                         + "\')");
    }

    final String hex = s.substring(idx, end);
    idx = end;

    int n = 0;
    for (int i = 0; i < hex.length(); i++) {
      final int digit = Character.digit(hex.charAt(i), 16);
      if (digit < 0) {
        throw new IllegalArgumentException("Illegal Unicode escape (\'\\u"
                                           + hex
                                           + "\')");
      }
      n = (n << 4) | digit;
    }

    return (char)n;
  }

}
/**
* The flag for whether the character class is exclusive. For an
* exclusive class, {@link #ranges} lists the <i>excluded</i>
* characters, and {@link #write(Appendable)} emits the class with a
* leading '<code>!</code>' and a trailing '<code> _</code>'.
*/
public boolean exclusive;
/**
* The list of character ranges. Note that, strictly speaking, this
* should be a set of disjoint character ranges. However, it is
* implemented as a list so that a character class can be printed as
* it was specified.
*
* <p />The constructors taking a list store that list directly, without
* copying it, and {@link #normalize()} sorts and modifies the list in
* place.
*/
public List<CharRange> ranges;
/**
* Create a new, non-exclusive character class. This constructor
* delegates to {@link #CharClass(boolean,List)} with the exclusive
* flag set to <code>false</code>; the specified list is stored as is
* and not copied.
*
* @param ranges The list of character ranges.
*/
public CharClass(List<CharRange> ranges) {
this(false, ranges);
}
/**
 * Create a new character class with the specified exclusivity and
 * character ranges.  The list is stored directly, without making a
 * copy.
 *
 * @param exclusive The exclusive flag.
 * @param ranges The list of character ranges.
 */
public CharClass(boolean exclusive, List<CharRange> ranges) {
  this.ranges    = ranges;
  this.exclusive = exclusive;
}
/**
* Create a new, non-exclusive character class for the specified
* character.
*
* @param c The character.
*/
public CharClass(char c) {
exclusive = false;
ranges = new ArrayList<CharRange>(1);
ranges.add(new CharRange(c));
}
/**
 * Create a new, non-exclusive character class from a textual
 * character class specification.  The specification is the text
 * between the brackets; it must not include the leading
 * '<code>[</code>' and trailing '<code>]</code>' characters.
 *
 * <p />Each character, optionally followed by '<code>-</code>' and a
 * second character, becomes one range, in the order specified.
 * Escape sequences are resolved by {@link Parser#next()}.
 *
 * @param s The character class specification.
 */
public CharClass(String s) {
  this.exclusive = false;
  this.ranges    = new ArrayList<CharRange>();

  for (Parser parser = new Parser(s); parser.hasNext(); ) {
    final char lower = parser.next();

    // A single character denotes the range from itself to itself.
    char upper = lower;
    if (parser.hasRange()) {
      upper = parser.next();
    }

    this.ranges.add(new CharRange(lower, upper));
  }
}
/**
* Get the tag identifying this terminal as a character class.
*
* @return <code>Tag.CHAR_CLASS</code>.
*/
public Tag tag() {
return Tag.CHAR_CLASS;
}
/**
 * Normalize this character class.  This method sorts the list of
 * character ranges according to their natural ordering and then
 * folds every range into its predecessor when the two are adjacent
 * or overlapping.  The folding step relies on the sort having
 * ordered the ranges by their first character.  The list is
 * modified in place.
 *
 * <p />Note that this method does <i>not</i> turn exclusive
 * character classes into non-exclusive ones (as that conversion
 * might negatively impact recognition performance).
 *
 * @return This character class.
 */
public CharClass normalize() {
  Collections.sort(ranges);

  int pos = 0;
  while (pos + 1 < ranges.size()) {
    final CharRange current   = ranges.get(pos);
    final CharRange following = ranges.get(pos + 1);

    if (following.last <= current.last) {
      // The following range is completely covered; drop it and
      // compare the current range against its new successor.
      ranges.remove(pos + 1);

    } else if (following.first - 1 <= current.last) {
      // The ranges touch or overlap; replace both with their union
      // and compare the union against its new successor.
      ranges.set(pos, new CharRange(current.first, following.last));
      ranges.remove(pos + 1);

    } else {
      // There is a gap between the ranges; move on.
      pos++;
    }
  }

  return this;
}
/**
 * Determine whether this character class and the specified
 * character class overlap.  Two character classes overlap if at
 * least one pair of their ranges has an endpoint of one range
 * contained in the other range; the classes need not be the same.
 *
 * @param klass The other character class.
 * @return <code>true</code> if the two character classes overlap.
 * @throws IllegalStateException
 *   Signals that either this or the specified class is exclusive.
 */
public boolean overlaps(CharClass klass) {
  if (exclusive || klass.exclusive) {
    // Report whichever class is the offending one, checking this
    // class first.
    final CharClass offender = exclusive ? this : klass;
    throw new IllegalStateException("overlap test for exclusive character " +
                                    "class " + offender);
  }

  for (CharRange theirs : klass.ranges) {
    for (CharRange ours : ranges) {
      final boolean disjoint =
        !theirs.contains(ours.first) && !theirs.contains(ours.last) &&
        !ours.contains(theirs.first) && !ours.contains(theirs.last);

      if (!disjoint) {
        return true;
      }
    }
  }

  return false;
}
/**
 * Determine the number of characters covered by this character
 * class, computed as the sum of the counts of the individual ranges.
 * Note that for exclusive character classes this method returns the
 * number of <i>excluded</i> characters.
 *
 * @return The number of characters for this character class.
 */
public int count() {
  int total = 0;
  for (int i = 0, size = ranges.size(); i < size; i++) {
    total += ranges.get(i).count();
  }
  return total;
}
/**
 * Compute a hash code for this character class.  The hash code is
 * the sum of the hash codes of all character ranges and therefore
 * does not depend on the order of the ranges; the exclusive flag
 * does not contribute.
 *
 * @return The hash code.
 */
@Override
public int hashCode() {
  int sum = 0;
  for (int i = 0, size = ranges.size(); i < size; i++) {
    sum += ranges.get(i).hashCode();
  }
  return sum;
}
/**
 * Determine whether this character class equals the specified
 * object.  Two character classes are equal if they have the same
 * exclusive flag and contain the same character ranges the same
 * number of times, irrespective of order.  The comparison is purely
 * structural; it does not determine whether the two classes
 * recognize the same characters.
 *
 * <p />Every range of this class is matched against a distinct range
 * of the other class.  A plain size check followed by a one-sided
 * containment test is not sufficient: with duplicate ranges it is
 * neither symmetric nor consistent with {@link #hashCode()}.
 *
 * @param o The object to compare to.
 * @return <code>true</code> if the specified object is a
 *   structurally equal character class.
 */
@Override
public boolean equals(Object o) {
  if (this == o) return true;
  if (! (o instanceof CharClass)) return false;

  final CharClass other = (CharClass)o;
  if (exclusive != other.exclusive) return false;
  if (ranges.size() != other.ranges.size()) return false;

  // Cross off one matching range of the other class for each range
  // of this class.  Since the sizes are equal, nothing is left over
  // when every range finds a match.
  final List<CharRange> unmatched = new ArrayList<CharRange>(other.ranges);
  for (CharRange r : ranges) {
    if (! unmatched.remove(r)) return false;
  }
  return true;
}
/**
 * Write this character class to the specified appendable.  The
 * ranges are emitted between '<code>[</code>' and '<code>]</code>'
 * in list order, with each character escaped through
 * <code>Utilities.escape</code> using
 * <code>Utilities.FULL_ESCAPES</code>.  A range whose first and last
 * characters are the same is written as a single character.  An
 * exclusive class is additionally prefixed with '<code>!</code>'
 * and followed by '<code> _</code>'.
 *
 * @param out The appendable to write to.
 * @throws IOException Signals an I/O error while appending.
 */
public void write(Appendable out) throws IOException {
  if (exclusive) {
    out.append('!');
  }

  out.append('[');
  for (CharRange range : ranges) {
    Utilities.escape(range.first, out, Utilities.FULL_ESCAPES);

    // Only a proper range needs the delimiter and its upper bound.
    if (range.first != range.last) {
      out.append('-');
      Utilities.escape(range.last, out, Utilities.FULL_ESCAPES);
    }
  }
  out.append(']');

  if (exclusive) {
    out.append(" _");
  }
}
}