/*
* Copyright (c) 1998-2010 Caucho Technology -- all rights reserved
*
* This file is part of Resin(R) Open Source
*
* Each copy or derived work must preserve the copyright notice and this
* notice unmodified.
*
* Resin Open Source is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* Resin Open Source is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty
* of NON-INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with Resin Open Source; if not, write to the
*
* Free Software Foundation, Inc.
* 59 Temple Place, Suite 330
* Boston, MA 02111-1307 USA
*
* @author Scott Ferguson
*/
package com.caucho.quercus.lib.regexp;
import java.util.HashMap;
import com.caucho.util.*;
// TODO: non-ascii range not quite correct for unicode, and neither is
// PHP's /u unicode option
class RegexpSet {
static final int BITSET_CHARS = 128;
static RegexpSet SPACE = null;
static RegexpSet WORD = null;
static RegexpSet DIGIT = null;
static RegexpSet DOT = null;
// POSIX character classes
static RegexpSet PALNUM = null;
static RegexpSet PALPHA = null;
static RegexpSet PASCII = null;
static RegexpSet PBLANK = null;
static RegexpSet PCNTRL = null;
static RegexpSet PDIGIT = null;
static RegexpSet PGRAPH = null;
static RegexpSet PLOWER = null;
static RegexpSet PPRINT = null;
static RegexpSet PPUNCT = null;
static RegexpSet PSPACE = null;
static RegexpSet PUPPER = null;
static RegexpSet PXDIGIT = null;
static HashMap<String, RegexpSet> CLASS_MAP = null;
boolean[] _bitset = new boolean[BITSET_CHARS];
IntSet _range;
/**
* Create a new RegexpSet
*/
RegexpSet() {
_range = new IntSet();
}
/**
* Create a new RegexpSet
*/
RegexpSet(RegexpSet old) {
System.arraycopy(old._bitset, 0, _bitset, 0, _bitset.length);
_range = (IntSet) old._range.clone();
}
/**
* Ors two character sets.
*/
void mergeOr(RegexpSet b) {
for (int i = 0; i < BITSET_CHARS; i++) {
_bitset[i] = _bitset[i] || b._bitset[i];
}
_range.union(b._range);
}
/**
* Ors a set with the inverse of another.
*/
void mergeOrInv(RegexpSet b) {
for (int i = 0; i < BITSET_CHARS; i++) {
_bitset[i] = _bitset[i] || !b._bitset[i];
}
_range.unionNegate(b._range, 0, 0xfffff);
}
/**
* Set a range of characters in a character set.
*/
void setRange(int low, int high) {
// php/4es0
// http://bugs.caucho.com/view.php?id=3811
if (low > high || low < 0) {
throw new RuntimeException(
"Range out of range (" + low + ", " + high + ")");
}
if (low < BITSET_CHARS) {
for (int i = low; i < Math.min(high + 1, BITSET_CHARS); i++) {
_bitset[i] = true;
}
if (high < BITSET_CHARS) {
return;
}
low = BITSET_CHARS;
}
_range.union(low, high);
}
/**
* Calculate the intersection of two sets.
*
* @return true if disjoint
*/
boolean mergeOverlap(RegexpSet next) {
boolean isDisjoint = true;
for (int i = 0; i < BITSET_CHARS; i++) {
_bitset[i] = _bitset[i] & next._bitset[i];
if (_bitset[i]) {
isDisjoint = false;
}
}
if (_range.intersection(next._range)) {
isDisjoint = false;
}
return isDisjoint;
}
/**
* Calculate the difference of two sets.
*
* @return true if disjoint
*/
void difference(RegexpSet next) {
for (int i = 0; i < BITSET_CHARS; i++) {
_bitset[i] = _bitset[i] & !next._bitset[i];
}
_range.difference(next._range);
}
/*
* Returns true if the character is in the set.
*/
boolean match(int ch) {
if (ch < 0) {
return false;
} else if (ch < BITSET_CHARS) {
return _bitset[ch];
} else {
return _range.contains(ch);
}
}
RegexpNode createNode() {
if (_range.size() == 0) {
return new RegexpNode.AsciiSet(_bitset);
} else {
return new RegexpNode.Set(_bitset, _range);
}
}
RegexpNode createNotNode() {
if (_range.size() == 0) {
return new RegexpNode.AsciiNotSet(_bitset);
} else {
return new RegexpNode.NotSet(_bitset, _range);
}
}
int getSize() {
return _range.size();
}
static {
SPACE = new RegexpSet();
SPACE.setRange(' ', ' ');
SPACE.setRange(0x09, 0x0A); //tab to newline
SPACE.setRange(0x0C, 0x0D); //form feed to carriage return
DOT = new RegexpSet();
DOT.setRange('\n', '\n');
DIGIT = new RegexpSet();
DIGIT.setRange('0', '9');
WORD = new RegexpSet();
WORD.setRange('a', 'z');
WORD.setRange('A', 'Z');
WORD.setRange('0', '9');
WORD.setRange('_', '_');
PASCII = new RegexpSet();
PASCII.setRange(0, 0x7F);
PASCII.setRange(0x81, 0x87);
PASCII.setRange(0x89, 0x97);
PASCII.setRange(0x9A, 0xFF);
PBLANK = new RegexpSet();
PBLANK.setRange(' ', ' ');
PBLANK.setRange('\t', '\t');
PBLANK.setRange(0xA0, 0xA0);
PCNTRL = new RegexpSet();
PCNTRL.setRange(0, 0x1F);
PCNTRL.setRange(0x7F, 0x7F);
PCNTRL.setRange(0x81, 0x81);
PCNTRL.setRange(0x8D, 0x8D);
PCNTRL.setRange(0x8F, 0x90);
PCNTRL.setRange(0x9D, 0x9D);
PDIGIT = new RegexpSet();
PDIGIT.setRange('0', '9');
PDIGIT.setRange(0xB2, 0xB3);
PDIGIT.setRange(0xB9, 0xB9);
PLOWER = new RegexpSet();
PLOWER.setRange('a', 'z');
PLOWER.setRange(0x83, 0x83);
PLOWER.setRange(0x9A, 0x9A);
PLOWER.setRange(0x9C, 0x9C);
PLOWER.setRange(0x9E, 0x9E);
PLOWER.setRange(0xAA, 0xAA);
PLOWER.setRange(0xB5, 0xB5);
PLOWER.setRange(0xBA, 0xBA);
PLOWER.setRange(0xDF, 0xF6);
PLOWER.setRange(0xF8, 0xFF);
PSPACE = new RegexpSet();
PSPACE.setRange(' ', ' ');
PSPACE.setRange(0x09, 0x0D);
PSPACE.setRange(0xA0, 0xA0);
PUPPER = new RegexpSet();
PUPPER.setRange('A', 'Z');
PUPPER.setRange(0x8A, 0x8A);
PUPPER.setRange(0x8C, 0x8C);
PUPPER.setRange(0x8E, 0x8E);
PUPPER.setRange(0x9F, 0x9F);
PUPPER.setRange(0xC0, 0xD6);
PUPPER.setRange(0xD8, 0xDE);
PXDIGIT = new RegexpSet();
PXDIGIT.setRange('0', '9');
PXDIGIT.setRange('A', 'F');
PXDIGIT.setRange('a', 'f');
PALPHA = new RegexpSet();
PALPHA.mergeOr(PLOWER);
PALPHA.mergeOr(PUPPER);
PALNUM = new RegexpSet();
PALNUM.mergeOr(PALPHA);
PALNUM.mergeOr(PDIGIT);
PPUNCT = new RegexpSet();
PPUNCT.setRange(0x21, 0x2F);
PPUNCT.setRange(0x3A, 0x40);
PPUNCT.setRange(0x5B, 0x60);
PPUNCT.setRange(0x7B, 0x7E);
PPUNCT.setRange(0x82, 0x82);
PPUNCT.setRange(0x84, 0x87);
PPUNCT.setRange(0x89, 0x89);
PPUNCT.setRange(0x8B, 0x8B);
PPUNCT.setRange(0x91, 0x97);
PPUNCT.setRange(0x9B, 0x9B);
PPUNCT.setRange(0xA1, 0xBF);
PPUNCT.setRange(0xD7, 0xD7);
PPUNCT.setRange(0xF7, 0xF7);
PGRAPH = new RegexpSet();
PGRAPH.mergeOr(PALNUM);
PGRAPH.mergeOr(PPUNCT);
PPRINT = new RegexpSet();
PPRINT.mergeOr(PGRAPH);
PPRINT.setRange(' ', ' ');
PPRINT.setRange(0x09, 0x09);
PPRINT.setRange(0xA0, 0xA0);
CLASS_MAP = new HashMap<String, RegexpSet>();
CLASS_MAP.put("alnum", PALNUM); //php/4ek0
CLASS_MAP.put("alpha", PALPHA); //php/4ek1
CLASS_MAP.put("ascii", PASCII); //php/4ek2
CLASS_MAP.put("blank", PBLANK); //php/4ek3
CLASS_MAP.put("cntrl", PCNTRL); //php/4ek4
CLASS_MAP.put("digit", PDIGIT); //php/4ek5
CLASS_MAP.put("graph", PGRAPH); //php/4ek6
CLASS_MAP.put("lower", PLOWER); //php/4ek7
CLASS_MAP.put("print", PPRINT); //php/4ek8
CLASS_MAP.put("punct", PPUNCT); //php/4ek9
CLASS_MAP.put("space", PSPACE); //php/4eka
CLASS_MAP.put("upper", PUPPER); //php/4ekb
CLASS_MAP.put("xdigit", PXDIGIT); //php/4ekc
}
}