/* * Copyright (c) 1998-2011 Caucho Technology -- all rights reserved * * This file is part of Resin(R) Open Source * * Each copy or derived work must preserve the copyright notice and this * notice unmodified. * * Resin Open Source is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Resin Open Source is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, or any warranty * of NON-INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with Resin Open Source; if not, write to the * * Free Software Foundation, Inc. * 59 Temple Place, Suite 330 * Boston, MA 02111-1307 USA * * @author Scott Ferguson */ package com.caucho.quercus.lib.regexp; import java.util.*; import java.util.logging.*; import com.caucho.quercus.QuercusException; import com.caucho.quercus.env.Env; import com.caucho.quercus.env.StringValue; import com.caucho.quercus.env.UnicodeBuilderValue; import com.caucho.quercus.lib.i18n.Utf8Encoder; import com.caucho.util.*; public class Regexp { private static final Logger log = Logger.getLogger(Regexp.class.getName()); private static final L10N L = new L10N(Regexp.class); public static final int FAIL = -1; public static final int SUCCESS = 0; final StringValue _rawRegexp; StringValue _pattern; int _flags; RegexpNode _prog; boolean _ignoreCase; boolean _isGlobal; int _nLoop; int _nGroup; // optim stuff CharBuffer _prefix; // initial string int _minLength; // minimum length possible for this regexp int _firstChar; boolean []_firstSet; boolean _isAnchorBegin; StringValue []_groupNames; boolean _isUnicode; boolean _isPHP5String; boolean _isUtf8; boolean _isEval; public Regexp(StringValue rawRegexp) throws IllegalRegexpException { _rawRegexp = rawRegexp; _pattern = rawRegexp; init(); Regcomp comp = new Regcomp(_flags); _prog = comp.parse(new PeekString(_pattern)); compile(_prog, comp); } protected void init() { StringValue rawRegexp = _rawRegexp; if (rawRegexp.length() < 2) { throw new IllegalStateException(L.l( "Can't find delimiters in regexp '{0}'.", rawRegexp)); } int head = 0; char delim = '/'; for (; head < rawRegexp.length() && Character.isWhitespace((delim = rawRegexp.charAt(head))); head++) { } if (delim == '{') delim = '}'; else if (delim == '[') delim = ']'; else if (delim == '(') delim = ')'; else if (delim == '<') delim = '>'; else if (delim == '\\' || Character.isLetterOrDigit(delim)) { throw new QuercusException(L.l( "Delimiter {0} in regexp '{1}' must " + "not be backslash or alphanumeric.", String.valueOf(delim), rawRegexp)); } int tail = rawRegexp.lastIndexOf(delim); if (tail <= 0) throw new QuercusException(L.l( "Can't find second {0} in regexp '{1}'.", String.valueOf(delim), rawRegexp)); StringValue sflags = rawRegexp.substring(tail + 1); StringValue pattern = rawRegexp.substring(head + 1, tail); _pattern = pattern; int flags = 0; for (int i = 0; sflags != null && i < sflags.length(); i++) { switch (sflags.charAt(i)) { case 'm': flags |= Regcomp.MULTILINE; break; case 's': flags |= Regcomp.SINGLE_LINE; break; case 'i': flags |= Regcomp.IGNORE_CASE; break; case 'x': flags |= Regcomp.IGNORE_WS; break; case 'g': flags |= Regcomp.GLOBAL; break; case 'A': flags |= Regcomp.ANCHORED; break; case 'D': flags |= Regcomp.END_ONLY; break; case 'U': flags |= Regcomp.UNGREEDY; break; case 'X': flags |= Regcomp.STRICT; break; case 'S': /* speedup */; break; case 'u': flags |= Regcomp.UTF8; break; case 'e': _isEval = true; break; default: throw new QuercusException(L.l("'{0}' is an unknown regexp flag in {1}", (char) sflags.charAt(i), rawRegexp)); } } _flags = flags; // XXX: what if unicode.semantics='true'? if ((flags & Regcomp.UTF8) != 0) { _pattern = fromUtf8(pattern); if (pattern == null) throw new QuercusException( L.l("Regexp: error converting subject to utf8")); } } public StringValue getRawRegexp() { return _rawRegexp; } public StringValue getPattern() { return _pattern; } public boolean isUTF8() { return _isUtf8; } public boolean isEval() { return _isEval; } public StringValue convertSubject(Env env, StringValue subject) { if (isUTF8()) return fromUtf8(subject); else return subject; } public StringValue convertResult(Env env, StringValue result) { if (isUTF8()) return toUtf8(env, result); else return result; } private void compile(RegexpNode prog, Regcomp comp) { _ignoreCase = (comp._flags & Regcomp.IGNORE_CASE) != 0; _isGlobal = (comp._flags & Regcomp.GLOBAL) != 0; _isAnchorBegin = (comp._flags & Regcomp.ANCHORED) != 0; _isUtf8 = (comp._flags & Regcomp.UTF8) != 0; if (prog.isAnchorBegin()) _isAnchorBegin = true; /* if (_ignoreCase) RegOptim.ignoreCase(prog); if (! _ignoreCase) RegOptim.eliminateBacktrack(prog, null); */ _minLength = prog.minLength(); _firstChar = prog.firstChar(); _firstSet = prog.firstSet(new boolean[256]); _prefix = new CharBuffer(prog.prefix()); //this._prog = RegOptim.linkLoops(prog); _nGroup = comp._maxGroup; _nLoop = comp._nLoop; _groupNames = new StringValue[_nGroup + 1]; for (Map.Entry<Integer,StringValue> entry : comp._groupNameMap.entrySet()) { StringValue groupName = entry.getValue(); if (_isUnicode) { } else if (isUTF8()) groupName.toBinaryValue("UTF-8"); _groupNames[entry.getKey().intValue()] = groupName; } } public StringValue getGroupName(int i) { return _groupNames[i]; } public boolean isGlobal() { return _isGlobal; } public boolean ignoreCase() { return _ignoreCase; } static StringValue fromUtf8(StringValue source) { StringValue target = new UnicodeBuilderValue(); int len = source.length(); for (int i = 0; i < len; i++) { char ch = source.charAt(i); if (ch < 0x80) { target.append(ch); } else if ((ch & 0xe0) == 0xc0) { if (len <= i + 1) { log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF")); return null; } char ch2 = source.charAt(++i); target.append((char) (((ch & 0x1f) << 6) + (ch2 & 0x3f))); } else if ((ch & 0xf0) == 0xe0) { if (len <= i + 2) { log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF")); return null; } char ch2 = source.charAt(++i); char ch3 = source.charAt(++i); target.append((char) (((ch & 0x0f) << 12) + ((ch2 & 0x3f) << 6) + (ch3 & 0x3f))); } else { if (i + 3 >= len) { log.fine(L.l("Regexp: bad UTF-8 sequence, saw EOF")); return null; } char ch2 = source.charAt(++i); char ch3 = source.charAt(++i); char ch4 = source.charAt(++i); int codePoint = ((ch & 0x07) << 18) + ((ch2 & 0x3F) << 12) + ((ch3 & 0x3F) << 6) + (ch4 & 0x3F); int high = ((codePoint - 0x10000) >> 10) + 0xD800; int low = (codePoint & 0x3FF) + 0xDC00; target.append((char) high); target.append((char) low); } } return target; } static StringValue toUtf8(Env env, StringValue source) { Utf8Encoder encoder = new Utf8Encoder(); return encoder.encode(env, source); } public String toString() { return getClass().getSimpleName() + "[" + _pattern + "]"; } }