package rene.util.regexp;
import java.util.*;
/**
An exception for the scanning of the regular expression.
*/
class RegExpException extends Exception {
    /** Position in the regular expression at which the error was detected. */
    int pos;
    /** The error key; identical to the exception message. */
    String S;

    /**
    @param s The error key (also used as the exception message).
    @param p The position in the regular expression.
    */
    public RegExpException (String s, int p) {
        super(s);
        S = s;
        pos = p;
    }

    /**
    Creates an exception without a known position (position 0).
    Delegates to the two-argument constructor so that string()
    returns the key instead of null.
    @param s The error key (also used as the exception message).
    */
    public RegExpException (String s) {
        this(s, 0);
    }

    /** @return the error key passed to the constructor. */
    public String string () { return S; }

    /** @return the error position passed to the constructor, or 0. */
    public int pos () { return pos; }
}
/**
Holds a position in a line of characters. This is used to store
the new position too, so that matches can advance the position
and return the match state.
*/
class Position {
    /** The characters that are scanned. */
    public char A[];
    /** K is the current index into A, N is the number of characters in A. */
    public int K, N;

    /**
    Start at index 0 of the given characters.
    @param a The characters to walk through.
    */
    public Position (char a[]) {
        A = a;
        N = a.length;
        K = 0;
    }

    /** @return the character at the current index. */
    public char get () {
        return A[K];
    }

    /** @return true, if the current index is no longer below the length. */
    public boolean end () {
        return !(K < N);
    }

    /** Step one character forward. */
    public void advance () {
        K += 1;
    }

    /** Step i characters forward. */
    public void advance (int i) {
        K = K + i;
    }

    /** Set the current index. */
    public void pos (int k) {
        K = k;
    }

    /** @return the current index. */
    public int pos () {
        return K;
    }

    /** @return the number of characters. */
    public int length () {
        return N;
    }
}
/**
An atom is a single letter, a dot, or a range. It has a
multiplication state *, + or ?. The atom can scan
itself from the regular expression, advancing the scan
position, and it can match itself against a string,
advancing the search position. If asked, it can find
a second match or say that there is none. In the first
case the position needs to be restored to the end
of the matched string.
*/
class Atom {
    /** The regular expression this atom belongs to. */
    RegExp R;
    /**
    Multiplier codes: exactly one (mult1), zero or one (mult01),
    one or more (mult12), zero or more (mult012).
    */
    final static int mult1 = 0, mult01 = 1, mult12 = 2, mult012 = 3;
    /** The multiplier of this atom, one of the codes above. */
    int Mult;
    /** Position noted by notePosition() and end of the matching string. */
    int LastMatch, MatchEnd;
    /** The position object, kept so that nextMatch() can reset it. */
    Position P;
    /** True, while nextMatch() might deliver another match. */
    boolean Match;

    /**
    Create an atom that must occur exactly once.
    @param r The regular expression this atom belongs to.
    */
    public Atom (RegExp r) {
        Mult = mult1;
        R = r;
    }

    /**
    Scan this atom from the regular expression and advance the
    position. This base implementation recognizes nothing.
    @return Success or failure.
    */
    public boolean scan (Position p) throws RegExpException {
        return false;
    }

    /**
    Try to match at the position. This base implementation
    never matches.
    @return Success or failure.
    */
    public boolean match (Position p) {
        return false;
    }

    /**
    Ask for another match. This base implementation has none.
    @return true, if another match was found.
    */
    public boolean nextMatch () {
        return false;
    }

    /**
    Scan the multiplier character (*, + or ?) behind the atom.
    The position only advances, if one of them was found.
    */
    public void scanMult (Position p) {
        if (p.end()) return;
        char c = p.get();
        if (c == '*') Mult = mult012;
        else if (c == '+') Mult = mult12;
        else if (c == '?') Mult = mult01;
        else return;
        p.advance();
    }

    /**
    Remember the position object and its current index, and
    reset the Match flag to false.
    */
    public void notePosition (Position p) {
        P = p;
        Match = false;
        LastMatch = p.pos();
    }

    /**
    @return true, if the atom may repeat (multiplier * or +).
    */
    public boolean canMultiple () {
        switch (Mult) {
            case mult012 :
            case mult12 :
                return true;
            default :
                return false;
        }
    }

    /**
    @return true, if the atom may match nothing (multiplier * or ?).
    */
    public boolean canVoid () {
        switch (Mult) {
            case mult012 :
            case mult01 :
                return true;
            default :
                return false;
        }
    }
}
/**
This is an atom, which is capable of finding the longest match
and upon request by nextMatch() shorter matches.
*/
class Simple extends Atom {
    public Simple (RegExp r) {
        super(r);
    }

    /**
    Match as many characters as the multiplier allows, starting
    at the position. If nothing matches, the result depends on
    canVoid() and the position is left untouched.
    @return Success or failure.
    */
    public boolean match (Position p) {
        notePosition(p);
        if (p.end() || !matchSimple(p)) {
            // no character consumed: fine only for * and ?
            return canVoid();
        }
        p.advance();
        Match = true;
        if (canMultiple()) {
            for (;;) {
                if (p.end() || !matchSimple(p)) break;
                p.advance();
            }
        }
        MatchEnd = p.pos();
        return true;
    }

    /**
    Give back one character of the last match and reset the
    position to the shortened end.
    @return false, if the match cannot be shortened any further.
    */
    public boolean nextMatch () {
        if (!Match) return false;
        MatchEnd--;
        boolean belowStart = MatchEnd < LastMatch;
        boolean emptyNotAllowed = (MatchEnd == LastMatch) && !canVoid();
        if (belowStart || emptyNotAllowed) {
            Match = false;
            return false;
        }
        P.pos(MatchEnd);
        Match = true;
        return true;
    }

    /**
    Subclasses override this to test a single character.
    @return true, if the character at the position matches this atom.
    */
    public boolean matchSimple (Position p) {
        return false;
    }
}
/**
A single character match.
*/
class Char extends Simple {
    /** The character this atom stands for. */
    char C;

    public Char (RegExp r) {
        super(r);
    }

    /**
    Take the character at the position as the literal, step over
    it and scan a following multiplier.
    @return always true.
    */
    public boolean scan (Position p) throws RegExpException {
        char literal = p.get();
        C = literal;
        p.advance();
        scanMult(p);
        return true;
    }

    /**
    @return true, if the character at the position, passed through
    the uppercase() method of the regular expression, equals C.
    */
    public boolean matchSimple (Position p) {
        char candidate = R.uppercase(p.get());
        return candidate == C;
    }
}
class SpecialChar extends Char {
    /**
    @param r The regular expression this atom belongs to.
    @param c The character to match; it is given here and not
    read from the expression.
    */
    public SpecialChar (RegExp r, char c) {
        super(r);
        C = c;
    }

    /**
    Step over one character of the expression without reading it,
    then scan a following multiplier.
    @return always true.
    */
    public boolean scan (Position p) throws RegExpException {
        p.advance();
        scanMult(p);
        return true;
    }
}
/**
Matches any character.
*/
class Dot extends Simple {
    public Dot (RegExp r) {
        super(r);
    }

    /**
    Step over one character of the expression and scan a
    following multiplier.
    @return always true.
    */
    public boolean scan (Position p) throws RegExpException {
        p.advance();
        scanMult(p);
        return true;
    }

    /**
    @return always true, whatever character is at the position.
    */
    public boolean matchSimple (Position p) {
        return true;
    }
}
/**
Holds one of the ranges in a character range, or a single character.
The range may include or exclude.
*/
class RangeClass {
    /** True, if this item excludes its characters. */
    boolean Exclude;

    /**
    @param exclude true for an excluding item.
    */
    public RangeClass (boolean exclude) {
        Exclude = exclude;
    }

    /** @return the exclude flag given to the constructor. */
    public boolean isExclude () {
        return Exclude;
    }

    /**
    Subclasses override this; the base class contains no character.
    @return always false.
    */
    public boolean inRange (char c) {
        return false;
    }
}
class CharRange extends RangeClass {
    /** Lowest and highest character code of the range (both inclusive). */
    int Min, Max;

    /**
    @param min The lowest character code in the range.
    @param max The highest character code in the range.
    @param exclude true for an excluding item.
    */
    public CharRange (int min, int max, boolean exclude) {
        super(exclude);
        Max = max;
        Min = min;
    }

    /** @return true, if c is neither below Min nor above Max. */
    public boolean inRange (char c) {
        return !(c < Min || c > Max);
    }
}
class AlphaRange extends RangeClass {
    public AlphaRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isLetter for c. */
    public boolean inRange (char c) {
        boolean letter = Character.isLetter(c);
        return letter;
    }
}
class AlphaNumericRange extends RangeClass {
    public AlphaNumericRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isLetterOrDigit for c. */
    public boolean inRange (char c) {
        boolean letterOrDigit = Character.isLetterOrDigit(c);
        return letterOrDigit;
    }
}
class NumericRange extends RangeClass {
    public NumericRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isDigit for c. */
    public boolean inRange (char c) {
        boolean digit = Character.isDigit(c);
        return digit;
    }
}
class ControlRange extends RangeClass {
    public ControlRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isISOControl for c. */
    public boolean inRange (char c) {
        boolean control = Character.isISOControl(c);
        return control;
    }
}
class LowerRange extends RangeClass {
    public LowerRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isLowerCase for c. */
    public boolean inRange (char c) {
        boolean lower = Character.isLowerCase(c);
        return lower;
    }
}
class UpperRange extends RangeClass {
    public UpperRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isUpperCase for c. */
    public boolean inRange (char c) {
        boolean upper = Character.isUpperCase(c);
        return upper;
    }
}
class SpaceRange extends RangeClass {
    public SpaceRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isSpaceChar for c. */
    public boolean inRange (char c) {
        boolean space = Character.isSpaceChar(c);
        return space;
    }
}
class WhiteSpaceRange extends RangeClass {
    public WhiteSpaceRange (boolean exclude) {
        super(exclude);
    }

    /** @return the result of Character.isWhitespace for c. */
    public boolean inRange (char c) {
        boolean white = Character.isWhitespace(c);
        return white;
    }
}
/**
The Range class holds a vector of ranges, single characters, or named
ranges. All are subclasses of RangeClass.
*/
class Range extends Simple {
    /** The scanned items; every element is a RangeClass. */
    Vector V;
    /**
    Start value of a match: stays true as long as no including
    item was scanned, so that a purely excluding range accepts
    everything that is not excluded.
    */
    boolean Any;

    public Range (RegExp r) {
        super(r);
        Any = true;
    }

    /**
    Scan a range in square brackets. The position must be on the
    opening bracket. After a ^ all following items are excluding.
    @return always true.
    @throws RegExpException if the range is not closed or malformed.
    */
    public boolean scan (Position p) throws RegExpException {
        V = new Vector();
        boolean exclude = false;
        p.advance(); // step over the opening bracket
        while (!p.end() && p.get() != ']') {
            if (p.get() == '^') {
                exclude = true;
                p.advance();
            }
            if (!exclude) Any = false;
            // Decide about a named range on the raw character, before
            // getNext() resolves escapes: an escaped \[ is a literal.
            boolean named = !p.end() && p.get() == '[';
            char a = getNext(p);
            if (named) scanNamedRange(p, exclude);
            else {
                char b = a;
                // getNext() guarantees that the position is not at the end
                if (p.get() == '-') {
                    p.advance();
                    b = getNext(p);
                }
                V.addElement(new CharRange(a, b, exclude));
            }
        }
        if (p.end() || p.get() != ']')
            throw new RegExpException("bracket.range", p.pos());
        p.advance();
        scanMult(p);
        return true;
    }

    /**
    Scan the rest of a named range such as [:alpha:]; the opening
    bracket has already been consumed. The name is compared without
    regard to case, because an ignore-case expression is converted
    to upper case before it is scanned.
    @throws RegExpException if the syntax or the name is wrong.
    */
    public void scanNamedRange (Position p, boolean exclude)
        throws RegExpException {
        if (getNext(p) != ':')
            throw new RegExpException("bracket.namedrange", p.pos());
        StringBuffer b = new StringBuffer();
        // getNext() throws at the end of the expression, which
        // terminates this loop for an unclosed name
        while (true) {
            char a = getNext(p);
            if (a == ':') break;
            b.append(a);
        }
        if (getNext(p) != ']')
            throw new RegExpException("bracket.namedrange", p.pos());
        String s = b.toString();
        if (s.equalsIgnoreCase("alpha")) V.addElement(new AlphaRange(exclude));
        else if (s.equalsIgnoreCase("digit")) V.addElement(new NumericRange(exclude));
        else if (s.equalsIgnoreCase("alnum")) V.addElement(new AlphaNumericRange(exclude));
        else if (s.equalsIgnoreCase("cntrl")) V.addElement(new ControlRange(exclude));
        else if (s.equalsIgnoreCase("lower")) V.addElement(new LowerRange(exclude));
        else if (s.equalsIgnoreCase("upper")) V.addElement(new UpperRange(exclude));
        else if (s.equalsIgnoreCase("space")) V.addElement(new SpaceRange(exclude));
        else if (s.equalsIgnoreCase("white")) V.addElement(new WhiteSpaceRange(exclude));
        else
            throw new RegExpException("bracket.namedrange", p.pos());
    }

    /**
    Read the next character of the range and step over it. A
    backslash takes the following character literally, and \t
    yields the tab character.
    @return the character read.
    @throws RegExpException if the expression ends at or directly
    behind the character (a range must still be closed).
    */
    public char getNext (Position p) throws RegExpException {
        if (p.end()) throw new RegExpException("bracket.range", p.pos());
        char c = p.get();
        if (c == '\\') {
            p.advance();
            if (p.end())
                throw new RegExpException("illegal.backslash", p.pos());
            c = p.get();
            if (c == 't') c = (char)9;
        }
        p.advance();
        if (p.end())
            throw new RegExpException("bracket.range", p.pos());
        return c;
    }

    /**
    Walk through the vector of items. Any excluding item that
    contains the character rejects it at once; otherwise an
    including item that contains it accepts it.
    @return true, if the character at the position is in the range.
    */
    public boolean matchSimple (Position p) {
        boolean match = Any;
        char c = R.uppercase(p.get());
        for (int i = 0; i < V.size(); i++) {
            RangeClass r = (RangeClass)V.elementAt(i);
            if (r.isExclude()) {
                if (r.inRange(c)) return false;
            }
            else if (r.inRange(c)) match = true;
        }
        return match;
    }
}
/**
This scans and matches (expression).
*/
class Bracket extends Atom {
    /** The expression inside the round brackets. */
    Part P;
    /** True, if this bracket is recorded in the bracket vector R.E. */
    boolean Top;
    /** Index of this bracket in R.E; -1 for brackets that are not recorded. */
    int EN;
    /** The position object of the last match, used by nextMatch(). */
    Position Pos;
    /** Start index of the last match. */
    int K;

    /**
    Only recorded (top) brackets take a number from the counter
    R.EN. Brackets that are never recorded must not consume a
    number, or a later recorded bracket would get an index beyond
    the end of R.E.
    */
    public Bracket (RegExp r, boolean top) {
        super(r);
        Top = top;
        if (top) {
            EN = r.EN;
            r.EN++;
        }
        else EN = -1;
    }

    /**
    Scan the expression between the round brackets. The position
    must be on the opening bracket. No multiplier is scanned
    behind the closing bracket.
    @return always true.
    @throws RegExpException if the closing bracket is missing.
    */
    public boolean scan (Position p) throws RegExpException {
        p.advance();
        P = new Part(R, false);
        P.scan(p);
        if (p.end() || p.get() != ')')
            throw new RegExpException("round.bracket", p.pos());
        p.advance();
        return true;
    }

    /**
    Match the inner expression and record the matched range,
    if this is a recorded bracket.
    */
    public boolean match (Position p) {
        Pos = p;
        K = p.pos();
        boolean result = P.match(p);
        if (result && Top) record(p.pos());
        return result;
    }

    /**
    Ask the inner expression for another match and record the
    new range, if this is a recorded bracket.
    */
    public boolean nextMatch () {
        boolean result = P.nextMatch();
        if (result && Top) record(Pos.pos());
        return result;
    }

    /**
    Store the range from K to end under the index EN. An entry
    left there by an earlier attempt is replaced, so that
    backtracking does not leave stale ranges in R.E.
    */
    private void record (int end) {
        PositionRange range = new PositionRange(K, end);
        if (EN < R.E.size()) R.E.setElementAt(range, EN);
        else R.E.insertElementAt(range, EN);
    }
}
/**
Pos matches the nullstring at a specified position.
*/
class Pos extends Atom {
    /**
    The required index. A negative value counts from the end:
    -1 stands for the index just behind the last character.
    */
    int P;

    public Pos (RegExp r, int pos) {
        super(r);
        P = pos;
    }

    /**
    Step over the one character of the expression that stands
    for this atom.
    @return always true.
    */
    public boolean scan (Position p) {
        p.advance();
        return true;
    }

    /**
    Consumes nothing.
    @return true, if the position is at the required index.
    */
    public boolean match (Position p) {
        int required = (P >= 0) ? P : p.length() + P + 1;
        return p.pos() == required;
    }
}
class Previous extends Atom {
    /** The number of the bracket whose match must be repeated. */
    int P;

    public Previous (RegExp r, int p) {
        super(r);
        P = p;
    }

    /**
    Step over the one character of the expression that stands
    for this atom.
    @return always true.
    */
    public boolean scan (Position p) {
        p.advance();
        return true;
    }

    /**
    Match the string found by bracket number P at the position.
    Both sides are passed through R.uppercase(), so that the
    comparison honors the ignore-case setting in the same way as
    the single character atoms do.
    @return false, if the bracket has no match or the text differs.
    */
    public boolean match (Position p) {
        try {
            String s = R.getBracket(P);
            if (s == null) return false;
            char a[] = s.toCharArray();
            for (int i = 0; i < a.length; i++) {
                if (p.end() || R.uppercase(a[i]) != R.uppercase(p.get()))
                    return false;
                p.advance();
            }
            return true;
        }
        catch (Exception e) {
            // kept from the original: any failure counts as no match
            return false;
        }
    }
}
/**
Branches are |ed to get a regular expression. Each branch
consists of a sequence of atoms.
*/
class Branch {
    /** The regular expression this branch belongs to. */
    RegExp R;
    /** The atoms of this branch in scan order. */
    Vector V;
    /** Handed on to the brackets created by this branch. */
    boolean Top;

    public Branch (RegExp r, boolean top) {
        V = new Vector();
        Top = top;
        R = r;
    }

    /**
    Scan atoms until the expression ends or a | or ) is found.
    The atoms are recognized by their first letter.
    @return true, if the scan stopped at | or ), otherwise
    whether at least one atom was scanned.
    */
    public boolean scan (Position p) throws RegExpException {
        while (!p.end()) {
            char c = p.get();
            if (c == '|' || c == ')') return true;
            Atom a;
            if (c == '.') a = new Dot(R);
            else if (c == '\\') a = scanEscape(p);
            else if (c == '[') a = new Range(R);
            else if (c == '(') a = new Bracket(R, Top);
            else if (c == '^') a = new Pos(R, 0);
            else if (c == '$') a = new Pos(R, -1);
            else a = new Char(R);
            a.scan(p);
            V.addElement(a);
        }
        return !V.isEmpty();
    }

    /**
    Step over the backslash and create the atom for the character
    behind it: t is the tab, a digit refers to a previous bracket,
    anything else is taken literally. The atom itself is not yet
    scanned.
    */
    private Atom scanEscape (Position p) throws RegExpException {
        p.advance();
        if (p.end()) throw new RegExpException("illegal.escape", p.pos());
        char e = p.get();
        if (e == 't') return new SpecialChar(R, (char)9);
        if (e >= '0' && e <= '9') return new Previous(R, e - '0');
        return new Char(R);
    }

    public boolean match (Position p) {
        return match(p, 0);
    }

    /**
    Match the atoms from index i on. The atom i is matched first;
    as long as the remaining atoms fail, it is asked for another
    match. A branch without atoms never matches.
    */
    public boolean match (Position p, int i) {
        int count = V.size();
        if (i >= count) return false;
        Atom a = (Atom)V.elementAt(i);
        if (i + 1 >= count) return a.match(p);
        if (!a.match(p)) return false;
        do {
            if (match(p, i + 1)) return true;
        }
        while (a.nextMatch());
        return false;
    }

    /**
    Search for another match.
    */
    public boolean nextMatch () {
        return nextMatch(0);
    }

    /**
    Ask the atoms from index i on for another match. The atom i
    is asked first; as long as the remaining atoms report none,
    it is asked again.
    */
    public boolean nextMatch (int i) {
        int count = V.size();
        if (i >= count) return false;
        Atom a = (Atom)V.elementAt(i);
        if (i + 1 >= count) return a.nextMatch();
        if (!a.nextMatch()) return false;
        do {
            if (nextMatch(i + 1)) return true;
        }
        while (a.nextMatch());
        return false;
    }
}
/**
A part is expression|part or a single expression.
*/
class Part {
    /** The regular expression this part belongs to. */
    RegExp R;
    /** The first branch. */
    Branch Left;
    /** The alternatives behind a |, or null. */
    Part Right;
    /** True for the parts of the outermost level. */
    boolean Top;
    /** For a top part: the value of R.EN after scanning the left branch. */
    int EN;

    public Part (RegExp r, boolean top) {
        Top = top;
        R = r;
    }

    /**
    Scan one branch and, behind a |, the remaining alternatives.
    A top part resets the bracket counter R.EN before each branch
    and notes its value behind the left branch.
    @return true, or the result of scanning the alternatives.
    */
    public boolean scan (Position p) throws RegExpException {
        if (Top) R.EN = 0;
        Left = new Branch(R, Top);
        Left.scan(p);
        if (Top) EN = R.EN;
        boolean alternative = !p.end() && p.get() == '|';
        if (!alternative) return true;
        if (Top) R.EN = 0;
        Right = new Part(R, Top);
        p.advance();
        return Right.scan(p);
    }

    /**
    Try the left branch; if it fails, reset the position and try
    the alternatives. A top part clears the recorded brackets
    first and sets R.EN to its own count, when its branch matches.
    */
    public boolean match (Position p) {
        int start = p.pos();
        if (Top) {
            R.E.removeAllElements();
            R.EN = 0;
        }
        if (Left.match(p)) {
            if (Top) R.EN = EN;
            return true;
        }
        p.pos(start);
        return Right != null && Right.match(p);
    }

    /**
    This tests for another match of the left branch and then of
    the alternatives, if there are any.
    */
    public boolean nextMatch () {
        return Left.nextMatch() || (Right != null && Right.nextMatch());
    }
}
/**
This is a class to scan a string with a regular expression. It follows
the normal rules for regular expressions with some exceptions and
extensions. Any instance of this class can perform as a match tool for
input strings, using the match method.
<p>
Here is a formal description of a regular expression. It has one or
more branches, separated by |. It matches anything, that matches one
of the branches.
<p>
A branch has one or more atoms concatenated. It matches the string, if
the atoms match strings, which concatenate to the given string.
<p>
An atom is either a string of non-special letters. A special letter
(such as |) becomes a non-special letter, if it is preceded by \. Or
an atom is a regular expression in (). Or it is a sequence of letters
in [] (a range). Or it is a . indicating any character.
<p>
An atom followed by * may repeat zero or more times, followed by + one
or more times, followed by ? zero or one times.
<p>
A range consists of letters, ranges of letters as in A-Z, or a ^
character, indicating that the letters or letter ranges are excluded.
The special letters must be preceded by \.
<p>
There are the predefined ranges [:alpha:], [:digit:], [:alnum:],
[:space:], [:white:], [:cntrl:], [:lower:] and [:upper:]. Note that
the brackets are part of the range definition.
<p>
Contrary to the normal implementation, ] and - must be escaped, when
they are to appear in ranges. Also, ranges may contain include and
exclude character ranges at the same time, as in [a-z^x].
<p>
The atom ^ matches only the beginning of the line, while $ matches the
line end.
*/
public class RegExp {
    /** store the regular expression string here (upper case, if case is ignored) */
    String S;
    /** the regular expression scanned tree */
    Part Left;
    /** the Valid flag */
    boolean Valid = false;
    /** the error string, if Valid is false */
    String ErrorString;
    /** the error position, if Valid is false */
    int Pos;
    /** the minimal length a string must have to match */
    int minLength = 0;
    /** the found match */
    int StartMatch, EndMatch;
    /** a vector for the found expressions in brackets */
    Vector E;
    /** A counter to use for the brackets */
    int EN;
    /** Note the searched string here */
    char A[];
    /** Ignore case */
    boolean IgnoreCase = false;

    /**
    This scans a regular expression for further usage.
    The error flag may be checked with the valid()
    function.
    <p>
    If case is ignored, the expression is converted to upper case
    character by character with Character.toUpperCase, which is the
    same conversion that uppercase() applies to the searched string.
    @param s The regular expression.
    @param ignorecase true, if the match should ignore case.
    */
    public RegExp (String s, boolean ignorecase) {
        char pattern[] = s.toCharArray();
        if (ignorecase) {
            // not String.toUpperCase(): that depends on the default
            // locale and may change the length of the string
            for (int i = 0; i < pattern.length; i++)
                pattern[i] = Character.toUpperCase(pattern[i]);
            s = new String(pattern);
        }
        S = s;
        E = new Vector();
        IgnoreCase = ignorecase;
        Position p = new Position(pattern);
        Left = new Part(this, true);
        ErrorString = "";
        try {
            Left.scan(p);
            // The scan of the outermost level stops early only at a
            // closing bracket that was never opened.
            if (!p.end())
                throw new RegExpException("round.bracket", p.pos());
            Valid = true;
        }
        catch (RegExpException e) {
            Valid = false;
            // the message is the error key and is always set
            ErrorString = e.getMessage();
            Pos = e.pos();
        }
        catch (Exception e) {
            Valid = false;
            ErrorString = "internal.error";
            Pos = 0;
        }
    }

    /**
    Checks the error state for the regular expression.
    @return true, if there is no error.
    */
    public boolean valid () {
        return Valid;
    }

    /**
    The error string tries to explain the error, when
    valid() returns false.
    @return the error string
    */
    public String errorString () {
        return ErrorString;
    }

    /**
    The position, where the scan error occurred.
    @return the error position.
    */
    public int errorPos () {
        return Pos;
    }

    /**
    Match the regular expression against a string.
    @return true, if a match was found.
    */
    public boolean match (String s) {
        return match(s.toCharArray(), 0);
    }

    /**
    Search the characters for a match, trying each start index
    from pos on. The searched characters are noted for
    getBracket() and expand().
    @param a The characters to search in.
    @param pos The first start index to try.
    @return true, if a match was found; always false for an
    invalid expression.
    */
    public boolean match (char a[], int pos) {
        A = a;
        // an expression that failed to scan has an incomplete tree
        if (!Valid) return false;
        Position p = new Position(A);
        int last = A.length - minLength;
        for (int i = pos; i <= last; i++) {
            p.pos(i);
            if (Left.match(p)) {
                StartMatch = i;
                EndMatch = p.pos();
                return true;
            }
        }
        return false;
    }

    /**
    @return start position of matching string.
    */
    public int startMatch () {
        return StartMatch;
    }

    /**
    @return end of the matching string.
    */
    public int endMatch () {
        return EndMatch;
    }

    /**
    A main() to test the scanner. The first argument is the
    regular expression, the second the string to search in.
    */
    public static void main (String args[]) {
        if (args.length < 2) {
            System.out.println("Usage: RegExp expression string");
            return;
        }
        RegExp R = new RegExp(args[0], false);
        if (R.Valid) {
            if (R.match(args[1])) {
                System.out.println("Matched from " + R.StartMatch +
                    " to " + R.EndMatch);
                System.out.println(R.EN + " brackets assigned");
                for (int i = 0; i < R.EN; i++) {
                    System.out.println(i + ": " + R.expand("(" + i + ")"));
                }
            }
        }
        else System.out.println(R.ErrorString + " at " + R.Pos);
    }

    /**
    Return an enumeration with the found brackets. The
    objects are instances of the PositionRange class.
    @see rene.util.regexp.PositionRange
    */
    public Enumeration getBrackets () {
        return E.elements();
    }

    /**
    @return the number of entries in the bracket vector.
    */
    public int getBracketNumber () {
        return E.size();
    }

    /**
    @param i The number of the bracket, starting with 0.
    @return the part of the searched string found by bracket i,
    or null, if there is none.
    */
    public String getBracket (int i) {
        try {
            PositionRange r = (PositionRange)E.elementAt(i);
            return new String(A, r.start(), r.end() - r.start());
        }
        catch (Exception e) {
            return null;
        }
    }

    /**
    Expand the replacement string and change (0), (1)
    etc. to the found bracket expansions. The sequence \t is
    changed to a tab, and a backslash takes the following token
    literally.
    @return expanded string or null on error.
    */
    public String expand (String s) {
        try {
            StringBuffer B = new StringBuffer();
            s = s.replace("\\t", "\t");
            StringTokenizer T = new StringTokenizer(s, "\\()", true);
            while (T.hasMoreTokens()) {
                String a = T.nextToken();
                if (a.equals("(")) {
                    String b = T.nextToken();
                    String c = T.nextToken();
                    if (!c.equals(")")) return null;
                    PositionRange p =
                        (PositionRange)E.elementAt(Integer.parseInt(b));
                    B.append(new String(A, p.start(), p.end() - p.start()));
                }
                else if (a.equals("\\")) {
                    a = T.nextToken();
                    B.append(a);
                }
                else B.append(a);
            }
            return B.toString();
        }
        catch (Exception e) {
            // malformed replacement strings and unknown bracket
            // numbers are reported as null
            return null;
        }
    }

    /**
    @return c in upper case, if case is ignored, else c unchanged.
    */
    char uppercase (char c) {
        if (IgnoreCase) return Character.toUpperCase(c);
        else return c;
    }
}