/** * Copyright (c) 2001, Sergey A. Samokhodkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form * must reproduce the above copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials provided with the distribution. * - Neither the name of jregex nor the names of its contributors may be used * to endorse or promote products derived from this software without specific prior * written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * @version 1.2_01 */ package totalcross.util.regex; import totalcross.io.*; import totalcross.util.*; /** * A handle for a precompiled regular expression.<br> * To match a regular expression <code>myExpr</code> against a text <code>myString</code> one should first create a Pattern object:<pre> * Pattern p=new Pattern(myExpr); * </pre> * then obtain a Matcher object:<pre> * Matcher matcher=p.matcher(myText); * </pre> * The latter is an automaton that actually performs a search. It provides the following methods: * <li> search for matching substrings : matcher.find() or matcher.findAll(); * <li> test whether the text matches the whole pattern : matcher.matches(); * <li> test whether the text matches the beginning of the pattern : matcher.matchesPrefix(); * <li> search with custom options : matcher.find(int options) * <p> * <b>Flags</b><br> * Flags (see REFlags interface) change the meaning of some regular expression elements at compiletime. * These flags may be passed both as string(see Pattern(String,String)) and as bitwise OR of: * <li><b>REFlags.IGNORE_CASE</b> - enables case insensitivity * <li><b>REFlags.MULTILINE</b> - forces "^" and "$" to match both at the start and the end of line; * <li><b>REFlags.DOTALL</b> - forces "." to match eols('\r' and '\n' in ASCII); * <li><b>REFlags.IGNORE_SPACES</b> - literal spaces in expression are ignored for better readability; * <li><b>REFlags.UNICODE</b> - the predefined classes('\w','\d',etc) are referenced to Unicode; * <li><b>REFlags.XML_SCHEMA</b> - permits XML Schema regular expressions syntax extentions. * <p> * <b>Multithreading</b><br> * Pattern instances are thread-safe, i.e. the same Pattern object may be used * by any number of threads simultaniously. On the other hand, the Matcher objects * are NOT thread safe, so, given a Pattern instance, each thread must obtain * and use its own Matcher. * * @see REFlags * @see Matcher * @see Matcher#setTarget(java.lang.String) * @see Matcher#setTarget(java.lang.String,int,int) * @see Matcher#setTarget(char[],int,int) * @see MatchResult * @see MatchResult#group(int) * @see MatchResult#start(int) * @see MatchResult#end(int) * @see MatchResult#length(int) * @see MatchResult#charAt(int,int) * @see MatchResult#prefix() * @see MatchResult#suffix() */ public class Pattern implements /*Serializable,*/REFlags{ String stringRepr; // tree entry Term root,root0; // required number of memory slots int memregs; // required number of iteration counters int counters; // number of lookahead groups int lookaheads; Hashtable namedGroupMap; protected Pattern() throws PatternSyntaxException{} /** * Compiles an expression with default flags. * @param regex the Perl5-compatible regular expression string. * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. * @see Pattern#Pattern(java.lang.String,java.lang.String) * @see Pattern#Pattern(java.lang.String,int) */ public Pattern(String regex) throws PatternSyntaxException{ this(regex,DEFAULT); } /** * Compiles a regular expression using Perl5-style flags. * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen. * The meaning of letters: * <ul> * <li><b>i</b> - case insensitivity, corresponds to REFLlags.IGNORE_CASE; * <li><b>m</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFLlags.MULTILINE flag; * <li><b>s</b> - single line treatment('.' matches \r's and \n's),corresponds to REFLlags.DOTALL; * <li><b>x</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFLlags.IGNORE_SPACES. * <li><b>u</b> - predefined classes are regarded as belonging to Unicode, corresponds to REFLlags.UNICODE; this may yield some performance penalty. * <li><b>X</b> - compatibility with XML Schema, corresponds to REFLlags.XML_SCHEMA. * </ul> * @param regex the Perl5-compatible regular expression string. * @param flags the Perl5-compatible flags. * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. * see REFlags */ public Pattern(String regex,String flags) throws PatternSyntaxException{ stringRepr=regex; compileInt(regex,parseFlags(flags)); } /** * Compiles a regular expression using REFlags. * The <code>flags</code> parameter is a bitwise OR of the folloing values: * <ul> * <li><b>REFLlags.IGNORE_CASE</b> - case insensitivity, corresponds to '<b>i</b>' letter; * <li><b>REFLlags.MULTILINE</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to '<b>m</b>'; * <li><b>REFLlags.DOTALL</b> - single line treatment('.' matches \r's and \n's),corresponds to '<b>s</b>'; * <li><b>REFLlags.IGNORE_SPACES</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to '<b>x</b>'. * <li><b>REFLlags.UNICODE</b> - predefined classes are regarded as belonging to Unicode, corresponds to '<b>u</b>'; this may yield some performance penalty. * <li><b>REFLlags.XML_SCHEMA</b> - compatibility with XML Schema, corresponds to '<b>X</b>'. * </ul> * @param regex the Perl5-compatible regular expression string. * @param flags the Perl5-compatible flags. * @exception PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. * see REFlags */ public Pattern(String regex, int flags) throws PatternSyntaxException{ compileInt(regex,flags); } //java.util.regex.* compatibility public static Pattern compile(String regex) throws PatternSyntaxException{ Pattern p=new Pattern(); p.compileInt(regex,0); return p; } public static Pattern compile(String regex, int flags) throws PatternSyntaxException{ Pattern p=new Pattern(); p.compileInt(regex,flags); return p; } protected void compileInt(String regex,int flags) throws PatternSyntaxException{ stringRepr=regex; Term.makeTree(regex,flags,this); } /** * How many capturing groups this expression includes? */ public int groupCount(){ return memregs; } /** * Get numeric id for a group name. * @return <code>null</code> if no such name found. * @see MatchResult#group(java.lang.String) * @see MatchResult#isCaptured(java.lang.String) */ public Integer groupId(String name){ return ((Integer)namedGroupMap.get(name)); } /** * A shorthand for Pattern.matcher(String).matches().<br> * @param s the target * @return true if the entire target matches the pattern * @see Matcher#matches() * @see Matcher#matches(String) */ public boolean matches(String s){ return matcher(s).matches(); } /** * A shorthand for Pattern.matcher(String).matchesPrefix().<br> * @param s the target * @return true if the entire target matches the beginning of the pattern * @see Matcher#matchesPrefix() */ public boolean startsWith(String s){ return matcher(s).matchesPrefix(); } /** * Returns a targetless matcher. * Don't forget to supply a target. */ public Matcher matcher(){ return new Matcher(this); } /** * Returns a matcher for a specified string. */ public Matcher matcher(String s){ Matcher m=new Matcher(this); m.setTarget(s); return m; } /** * Returns a matcher for a specified region. */ public Matcher matcher(char[] data,int start,int end){ Matcher m=new Matcher(this); m.setTarget(data,start,end); return m; } /** * Returns a matcher for a match result (in a performance-friendly way). * <code>groupId</code> parameter specifies which group is a target. * @param groupId which group is a target; either positive integer(group id), or one of MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET. */ public Matcher matcher(MatchResult res,int groupId){ Matcher m=new Matcher(this); if(res instanceof Matcher){ m.setTarget((Matcher)res,groupId); } else{ m.setTarget(res.targetChars(),res.start(groupId)+res.targetStart(),res.length(groupId)); } return m; } /** * Just as above, yet with symbolic group name. * @exception NullPointerException if there is no group with such name */ public Matcher matcher(MatchResult res,String groupName){ Integer id=res.pattern().groupId(groupName); if(id==null) throw new IllegalArgumentException("group not found:"+groupName); int group=id.intValue(); return matcher(res,group); } /** * Returns a matcher taking a text stream as target. * <b>Note that this is not a true POSIX-style stream matching</b>, i.e. the whole length of the text is preliminary read and stored in a char array. * @param text a text stream * @param length the length to read from a stream; if <code>len</code> is <code>-1</code>, the whole stream is read in. * @exception IOException indicates an IO problem * @exception OutOfMemoryError if a stream is too lengthy */ public Matcher matcher(CharStream text,int length)throws IOException{ Matcher m=new Matcher(this); m.setTarget(text,length); return m; } /** * Returns a replacer of a pattern by specified perl-like expression. * Such replacer will substitute all occurences of a pattern by an evaluated expression * ("$&" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc). * Example:<pre> * String text="The quick brown fox jumped over the lazy dog"; * Pattern word=new Pattern("\\w+"); * System.out.println(word.replacer("[$&]").replace(text)); * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]" * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)"); * System.out.println(swap.replacer("$3$2$1").replace(text)); * //prints "The quick brown dog jumped over the lazy fox" * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)"); * System.out.println(scramble.replacer("$3$2$1").replace(text)); * //prints "quick The fox brown over jumped lazy the dog" * </pre> * @param expr a perl-like expression, the "$&" and "${&}" standing for whole match, the "$N" and "${N}" standing for group#N, and "${Foo}" standing for named group Foo. * @see Replacer */ public Replacer replacer(String expr){ return new Replacer(this,expr); } /** * Returns a replacer will substitute all occurences of a pattern * through applying a user-defined substitution model. * @param model a Substitution object which is in charge for match substitution * @see Replacer */ public Replacer replacer(Substitution model){ return new Replacer(this,model); } /** * Tokenizes a text by an occurences of the pattern. * Note that a series of adjacent matches are regarded as a single separator. * The same as new RETokenizer(Pattern,String); * @see RETokenizer * @see RETokenizer#RETokenizer(totalcross.util.regex.Pattern,java.lang.String) * */ public RETokenizer tokenizer(String text){ return new RETokenizer(this,text); } /** * Tokenizes a specified region by an occurences of the pattern. * Note that a series of adjacent matches are regarded as a single separator. * The same as new RETokenizer(Pattern,char[],int,int); * @see RETokenizer * @see RETokenizer#RETokenizer(totalcross.util.regex.Pattern,char[],int,int) */ public RETokenizer tokenizer(char[] data,int off,int len){ return new RETokenizer(this,data,off,len); } /** * Tokenizes a specified region by an occurences of the pattern. * Note that a series of adjacent matches are regarded as a single separator. * The same as new RETokenizer(Pattern,Reader,int); * @see RETokenizer */ public RETokenizer tokenizer(CharStream in,int length) throws IOException{ return new RETokenizer(this,in,length); } public String toString(){ return stringRepr; } /** * Returns a less or more readable representation of a bytecode for the pattern. */ public String toString_d(){ return root.toStringAll(); } static int parseFlags(String flags)throws PatternSyntaxException{ boolean enable=true; int len=flags.length(); int result=DEFAULT; for(int i=0;i<len;i++){ char c=flags.charAt(i); switch(c){ case '+': enable=true; break; case '-': enable=false; break; default: int flag=getFlag(c); if(enable) result|=flag; else result&=(~flag); } } return result; } static int parseFlags(char[] data,int start,int len)throws PatternSyntaxException{ boolean enable=true; int result=DEFAULT; for(int i=0;i<len;i++){ char c=data[start+i]; switch(c){ case '+': enable=true; break; case '-': enable=false; break; default: int flag=getFlag(c); if(enable) result|=flag; else result&=(~flag); } } return result; } private static int getFlag(char c)throws PatternSyntaxException{ switch(c){ case 'i': return IGNORE_CASE; case 'm': return MULTILINE; case 's': return DOTALL; case 'x': return IGNORE_SPACES; case 'u': return UNICODE; case 'X': return XML_SCHEMA; } throw new PatternSyntaxException("unknown flag: "+c); } }