CharClass.java example

Explorer
xtc-master
/*
 * xtc - The eXTensible Compiler
 * Copyright (C) 2004 Robert Grimm
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */
package xtc.parser;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import xtc.util.Utilities;

/**
 * A character class terminal.
 *
 * <p />Note that {@link #equals(Object)} only determines whether the
 * two character class terminals have the same structure (that is, are
 * both exclusive or non-exclusive and have the same character ranges,
 * irrespective of the order in which the ranges are listed), but does
 * not determine whether the two character class terminals recognize
 * the same characters.
 *
 * @author Robert Grimm
 * @version $Revision: 1.1 $
 */
public class CharClass extends CharTerminal {

  /** Parser for a character class specification. */
  public static class Parser {

    /** The string. */
    protected String s;

    /** The index into the string. */
    protected int    idx;

    /**
     * Create a new character class parser for the specified string.
     * Note that the string must not include the leading
     * '<code>[</code>' and trailing '<code>]</code>' characters.
     *
     * @param s The string to parse.
     */
    public Parser(String s) {
      this.s = s;
      idx    = 0;
    }

    /**
     * Determine whether there are more characters.
     *
     * @return <code>true</code> if there are more characters.
     */
    public boolean hasNext() {
      return idx < s.length();
    }

    /**
     * Determine whether the next character is a range delimiter
     * '<code>-</code>'.  Note that this test is <i>destructive</i>:
     * if the next character is a range delimiter, it is consumed.
     *
     * @return <code>true</code> if the next character is a range
     *   delimiter.
     */
    public boolean hasRange() {
      if ((idx < s.length()) && ('-' == s.charAt(idx))) {
        idx++;
        return true;
      }
      return false;
    }

    /**
     * Return the next character.  If the character is represented by
     * an escape sequence (including Java Unicode and regex-like
     * escapes), it is unescaped.
     *
     * <p />Callers are expected to check {@link #hasNext()} first;
     * invoking this method at the end of the string fails with the
     * index exception raised by <code>String.charAt</code>.
     *
     * @return The next character.
     * @throws IllegalArgumentException
     *   Signals an unknown, incomplete, or malformed escape sequence.
     */
    public char next() {
      char c = s.charAt(idx);
      idx++;

      if ('\\' != c) {
        return c;
      }

      // A backslash must be followed by the character it escapes.
      if (idx >= s.length()) {
        throw new IllegalArgumentException("Incomplete character escape (\'\\\')");
      }

      c = s.charAt(idx);
      idx++;

      switch (c) {
      case 'b':
        return '\b';
      case 't':
        return '\t';
      case 'n':
        return '\n';
      case 'f':
        return '\f';
      case 'r':
        return '\r';
      case '"':
        return '"';
      case '\'':
        return '\'';
      case '-':
        return '-';
      case '[':
        return '[';
      case '\\':
        return '\\';
      case ']':
        return ']';
      case 'u':
        return parseUnicodeEscape();
      default:
        throw new IllegalArgumentException("Illegal character escape (\'\\"
                                           + c + "\')");
      }
    }

    /**
     * Parse the four hexadecimal digits of a Unicode escape, starting
     * at the current index, and consume them.
     *
     * <p />Each digit is validated individually.  This rejects input
     * with a leading sign, which <code>Integer.parseInt</code> would
     * accept, and reports a truncated escape as an illegal argument
     * rather than as an index error.
     *
     * @return The character denoted by the four digits.
     * @throws IllegalArgumentException
     *   Signals that fewer than four characters remain or that one of
     *   them is not a hexadecimal digit.
     */
    private char parseUnicodeEscape() {
      final int start = idx;
      final int end   = start + 4;

      if (end > s.length()) {
        throw new IllegalArgumentException("Illegal Unicode escape (\'\\u"
                                           + s.substring(start)
                                           + "\')");
      }

      int n = 0;
      for (int i=start; i<end; i++) {
        int digit = Character.digit(s.charAt(i), 16);
        if (digit < 0) {
          throw new IllegalArgumentException("Illegal Unicode escape (\'\\u"
                                             + s.substring(start, end)
                                             + "\')");
        }
        n = (n << 4) | digit;
      }

      idx = end;
      return (char)n;
    }

  }

  /** The flag for whether the character class is exclusive. */
  public boolean exclusive;

  /**
   * The list of character ranges.  Note that, strictly speaking, this
   * should be a set of disjoint character ranges.  However, it is
   * implemented as a list so that a character class can be printed as
   * it was specified.
   */
  public List ranges;

  /**
   * Create a new, non-exclusive character class.
   *
   * @param ranges The list of character ranges.
   */
  public CharClass(List ranges) {
    this(false, ranges);
  }

  /**
   * Create a new character class.  The specified list is stored
   * directly (it is not copied).
   *
   * @param exclusive The exclusive flag.
   * @param ranges The list of character ranges.
   */
  public CharClass(boolean exclusive, List ranges) {
    this.exclusive = exclusive;
    this.ranges    = ranges;
  }

  /**
   * Create a new, non-exclusive character class for the specified
   * character.
   *
   * @param c The character.
   */
  public CharClass(char c) {
    exclusive = false;
    ranges    = new ArrayList(1);
    ranges.add(new CharRange(c));
  }

  /**
   * Create a new, non-exclusive character class based on the supplied
   * character class specification.  Note that the character class
   * specification must not include the leading '<code>[</code>' and
   * trailing '<code>]</code>' characters.
   *
   * @param s The character class specification.
   * @throws IllegalArgumentException
   *   Signals a malformed escape sequence or a range delimiter that
   *   is not followed by the range's last character.
   */
  public CharClass(String s) {
    exclusive = false;
    ranges    = new ArrayList();
    Parser p  = new Parser(s);

    while (p.hasNext()) {
      char c1 = p.next();
      char c2 = c1;

      if (p.hasRange()) {
        // The delimiter has been consumed; a range needs an end character.
        if (! p.hasNext()) {
          throw new IllegalArgumentException("Incomplete character range in "
                                             + "character class (\'" + s
                                             + "\')");
        }
        c2 = p.next();
      }

      ranges.add(new CharRange(c1, c2));
    }
  }

  /**
   * Normalize this character class.  This method sorts the list of
   * character ranges by each range's first character and combines
   * adjacent or overlapping ranges.  However, it does <i>not</i> turn
   * exclusive character classes into non-exclusive ones (as that
   * conversion might negatively impact recognition performance).
   *
   * <p />The list of ranges is modified in place.
   *
   * @return This character class.
   */
  public CharClass normalize() {
    Collections.sort(ranges);

    // Only advance once the range at position i can no longer absorb
    // its successor; after a removal the same position is re-examined.
    int i = 0;
    while (i < ranges.size()-1) {
      CharRange r1 = (CharRange)ranges.get(i);
      CharRange r2 = (CharRange)ranges.get(i+1);

      if (r1.last >= r2.last) {
        // The successor ends within the current range: drop it.
        ranges.remove(i+1);
      } else if (r1.last >= r2.first - 1) {
        // The successor overlaps or directly follows: merge the two.
        ranges.set(i, new CharRange(r1.first, r2.last));
        ranges.remove(i+1);
      } else {
        i++;
      }
    }

    return this;
  }

  /**
   * Determine whether this character class overlaps the specified
   * character class.  Two character classes overlap if they have
   * common characters, though they need not necessarily be the same.
   * Note that the result of this method is only well-defined if both
   * character classes are non-exclusive.
   *
   * @param klass The other character class.
   * @return <code>true</code> if the two character classes overlap.
   */
  public boolean overlaps(CharClass klass) {
    Iterator      iter  = klass.ranges.iterator();
    while (iter.hasNext()) {
      CharRange   r1    = (CharRange)iter.next();
      Iterator    iter2 = ranges.iterator();
      while (iter2.hasNext()) {
        CharRange r2    = (CharRange)iter2.next();
        // Two ranges overlap if either contains an end point of the other.
        if (r1.contains(r2.first) || r1.contains(r2.last) ||
            r2.contains(r1.first) || r2.contains(r1.last)) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Determine the number of characters covered by this character
   * class.  Note that for exclusive character classes this method
   * returns the number of <i>excluded</i> characters.  The result is
   * the sum of the counts of the individual ranges.
   *
   * @return The number of characters for this character class.
   */
  public int count() {
    int      count = 0;
    Iterator iter  = ranges.iterator();
    while (iter.hasNext()) {
      count += ((CharRange)iter.next()).count();
    }
    return count;
  }

  /**
   * Compute a hash code for this character class.  The hash code is
   * the sum of the hash codes of the character ranges and therefore
   * does not depend on the order of the ranges; the exclusive flag
   * does not contribute.
   *
   * @return The hash code.
   */
  public int hashCode() {
    int      hash = 0;

    Iterator iter = ranges.iterator();
    while (iter.hasNext()) {
      hash += iter.next().hashCode();
    }

    return hash;
  }

  /**
   * Determine whether this character class has the same structure as
   * the specified object.  Two character classes are equal if they
   * have the same exclusive flag and contain the same character
   * ranges the same number of times, irrespective of order.
   *
   * @param o The object to compare to.
   * @return <code>true</code> if the specified object is a
   *   structurally equal character class.
   */
  public boolean equals(Object o) {
    if (this == o) return true;
    if (! (o instanceof CharClass)) return false;
    CharClass other = (CharClass)o;
    if (exclusive != other.exclusive) return false;
    if (ranges.size() != other.ranges.size()) return false;

    // Match every range against a distinct range of the other class.
    // With equal sizes this is a symmetric multiset comparison, even
    // when a list contains the same range more than once.
    List     unmatched = new ArrayList(other.ranges);
    Iterator iter      = ranges.iterator();
    while (iter.hasNext()) {
      if (! unmatched.remove(iter.next())) return false;
    }
    return true;
  }

  /**
   * Return a string representation of this character class.  The
   * ranges are printed in list order between '<code>[</code>' and
   * '<code>]</code>', with each character escaped.  An exclusive
   * character class is prefixed with '<code>!</code>' and followed by
   * '<code> .</code>'.
   *
   * @return The string representation.
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();

    if (exclusive) {
      buf.append('!');
    }

    buf.append('[');
    Iterator iter = ranges.iterator();
    while (iter.hasNext()) {
      CharRange r = (CharRange)iter.next();

      Utilities.escape(r.first, buf, Utilities.FULL_ESCAPES);
      if (r.first != r.last) {
        buf.append('-');
        Utilities.escape(r.last,  buf, Utilities.FULL_ESCAPES);
      }
    }
    buf.append(']');

    if (exclusive) {
      buf.append(" .");
    }

    return buf.toString();
  }

}