RegExp.java example

Explorer
DragonGoApp-master
- src
  - fr
    - xtof54
  - rene
    - util
package rene.util.regexp;

import java.util.*;

/**
An exception for the scanning of the regular expression.
*/

class RegExpException extends Exception
{	/** The position in the regular expression, where the scan failed (0, if unknown). */
	int pos;
	/** The error key, which was passed to the constructor. */
	String S;
	/**
	@param s The error key, e.g. "bracket.range".
	@param p The position in the regular expression, where the error occurred.
	*/
	public RegExpException (String s, int p)
	{	super(s);
		S=s;
		pos=p;
	}
	/**
	Create an exception without a known position. The position
	is reported as 0.
	@param s The error key.
	*/
	public RegExpException (String s)
	{	// Delegate, so that string() returns the key for this
		// constructor too. Formerly S remained null here.
		this(s,0);
	}
	/** @return The error key. */
	public String string () { return S; }
	/** @return The error position. */
	public int pos () { return pos; }
}

/**
Holds a position in a line of characters. This is used to store
the new position too, so that matches can advance the position
and return the match state.
*/

class Position {
	/** The characters that are walked through. */
	public char A[];
	/** K is the current index into A, N is the number of characters in A. */
	public int K,N;

	/**
	Start at index 0 of the given characters.
	@param a The characters to walk through.
	*/
	public Position (char a[]) {
		A=a;
		N=a.length;
		K=0;
	}

	/** @return The character at the current index. */
	public char get () {
		return A[K];
	}

	/** @return true, if the current index is at or behind the last character. */
	public boolean end () {
		return !(K<N);
	}

	/** Move one character forward. */
	public void advance () {
		K=K+1;
	}

	/** Move i characters forward. */
	public void advance (int i) {
		K=K+i;
	}

	/** Set the current index. */
	public void pos (int k) {
		K=k;
	}

	/** @return The current index. */
	public int pos () {
		return K;
	}

	/** @return The number of characters. */
	public int length () {
		return N;
	}
}

/**
An atom is a single letter, a dot, or a range. It has a
multiplication state *, + or ?. The atom can scan
itself from the regular expression, advancing the scan
position, and it can match itself against a string,
advancing the search position. If asked, it can find
a second match or say that there is none. In the first
case the position needs to be restored to the end
of the matched string.
*/

class Atom {
	/** The regular expression this atom belongs to. */
	RegExp R;
	/** The multiplicator codes: none, ?, + and * (in this order). */
	final static int mult1=0,mult01=1,mult12=2,mult012=3;
	/** The multiplicator code of this atom. */
	int Mult;
	/** The position noted by notePosition() and the end of the matching string. */
	int LastMatch,MatchEnd;
	/** The position object noted by notePosition(), kept for nextMatch(). */
	Position P;
	/** True, if there might be a nextMatch(). */
	boolean Match;

	/**
	Create an atom without a multiplicator.
	@param r The regular expression the atom belongs to.
	*/
	public Atom (RegExp r) {
		R=r;
		Mult=mult1;
	}

	/**
	Scan the atom from the regular expression. This default
	scans nothing.
	@return false.
	*/
	public boolean scan (Position p) throws RegExpException {
		return false;
	}

	/**
	Match the atom at the position. This default matches nothing.
	@return false.
	*/
	public boolean match (Position p) {
		return false;
	}

	/**
	Ask for another match. This default has none.
	@return false.
	*/
	public boolean nextMatch () {
		return false;
	}

	/**
	Read an optional *, + or ? at the position. The position is
	advanced only, if one of these characters was found.
	*/
	public void scanMult (Position p) {
		if (p.end()) return;
		char sign=p.get();
		if (sign=='*') Mult=mult012;
		else if (sign=='+') Mult=mult12;
		else if (sign=='?') Mult=mult01;
		else return;
		p.advance();
	}

	/**
	Remember the position object and its current index, and
	reset the Match flag.
	*/
	public void notePosition (Position p) {
		P=p;
		LastMatch=p.pos();
		Match=false;
	}

	/** @return true for the multiplicators + and *. */
	public boolean canMultiple () {
		return Mult==mult12 || Mult==mult012;
	}

	/** @return true for the multiplicators ? and *. */
	public boolean canVoid () {
		return Mult==mult01 || Mult==mult012;
	}
}

/**
This is an atom, which is capable of finding the longest match
and upon request by nextMatch() shorter matches.
*/

class Simple
	extends Atom {
	public Simple (RegExp r) {
		super(r);
	}

	/**
	Match as many characters as the multiplicator allows. If not even
	the first character matches, the result depends on canVoid() and
	the position is not advanced.
	@return Success or failure.
	*/
	public boolean match (Position p) {
		notePosition(p);
		boolean firstFits=!p.end() && matchSimple(p);
		if (!firstFits) return canVoid();
		p.advance();
		Match=true;
		if (canMultiple()) {
			while (!p.end() && matchSimple(p)) {
				p.advance();
			}
		}
		MatchEnd=p.pos();
		return true;
	}

	/**
	Give back one character of the last match and set the noted
	position to the new end of the match.
	@return false, if the match cannot be shortened any further.
	*/
	public boolean nextMatch () {
		if (!Match) return false;
		MatchEnd--;
		// The shorter match is acceptable, if it is not empty, or if
		// it is empty and the atom may match the empty string.
		boolean acceptable=
			MatchEnd>LastMatch || (MatchEnd==LastMatch && canVoid());
		if (!acceptable) {
			Match=false;
			return false;
		}
		P.pos(MatchEnd);
		Match=true;
		return true;
	}

	/**
	Override this to get useful matches.
	@return true, if the single character at the position matches
	this atom. This default returns false.
	*/
	public boolean matchSimple (Position p) {
		return false;
	}
}

/**
A single character match.
*/

class Char
	extends Simple {
	/** The character to match, read from the expression by scan(). */
	char C;

	public Char (RegExp r) {
		super(r);
	}

	/**
	Take the character at the position as the character to match,
	then read an optional multiplicator.
	@return true.
	*/
	public boolean scan (Position p) throws RegExpException {
		char found=p.get();
		C=found;
		p.advance();
		scanMult(p);
		return true;
	}

	/**
	@return true, if the character at the position equals C, after
	it went through RegExp.uppercase().
	*/
	public boolean matchSimple (Position p) {
		char seen=R.uppercase(p.get());
		return seen==C;
	}
}

/**
A character atom, which gets its character from the constructor
instead of reading it from the expression.
*/
class SpecialChar
	extends Char {
	/**
	@param r The regular expression the atom belongs to.
	@param c The character to match.
	*/
	public SpecialChar (RegExp r, char c) {
		super(r);
		this.C=c;
	}

	/**
	Skip one character of the expression and read an optional
	multiplicator.
	@return true.
	*/
	public boolean scan (Position p) throws RegExpException {
		p.advance(1);
		scanMult(p);
		return true;
	}
}

/**
Matches any character.
*/

class Dot
	extends Simple {
	public Dot (RegExp r) {
		super(r);
	}

	/**
	Skip one character of the expression and read an optional
	multiplicator.
	@return true.
	*/
	public boolean scan (Position p) throws RegExpException {
		p.advance(1);
		scanMult(p);
		return true;
	}

	/** @return true for every character. */
	public boolean matchSimple (Position p) {
		return true;
	}
}

/**
Holds one of the ranges in a character range, or a single character.
The range may include or exclude.
*/

class RangeClass {
	/** True, if the characters of this entry are excluded. */
	boolean Exclude;

	/** @param exclude The value reported by isExclude(). */
	public RangeClass (boolean exclude) {
		this.Exclude=exclude;
	}

	/** @return The exclude flag given to the constructor. */
	public boolean isExclude () {
		return this.Exclude;
	}

	/**
	Subclasses decide here, if a character belongs to the entry.
	@return false in this base class.
	*/
	public boolean inRange (char c) {
		return false;
	}
}

/**
A range of character codes from Min to Max, both included.
*/
class CharRange extends RangeClass {
	int Min,Max;

	/**
	@param min The lowest character code of the range.
	@param max The highest character code of the range.
	@param exclude The exclude flag of the entry.
	*/
	public CharRange (int min, int max, boolean exclude) {
		super(exclude);
		this.Min=min;
		this.Max=max;
	}

	/** @return true, if the code of c lies between Min and Max. */
	public boolean inRange (char c) {
		int code=c;
		return Min<=code && code<=Max;
	}
}

/**
The letters, as decided by Character.isLetter().
*/
class AlphaRange extends RangeClass {
	public AlphaRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is a letter. */
	public boolean inRange (char c) {
		boolean letter=Character.isLetter(c);
		return letter;
	}
}

/**
The letters and digits, as decided by Character.isLetterOrDigit().
*/
class AlphaNumericRange extends RangeClass {
	public AlphaNumericRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is a letter or a digit. */
	public boolean inRange (char c) {
		boolean letterOrDigit=Character.isLetterOrDigit(c);
		return letterOrDigit;
	}
}

/**
The digits, as decided by Character.isDigit().
*/
class NumericRange extends RangeClass {
	public NumericRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is a digit. */
	public boolean inRange (char c) {
		boolean digit=Character.isDigit(c);
		return digit;
	}
}

/**
The control characters, as decided by Character.isISOControl().
*/
class ControlRange extends RangeClass {
	public ControlRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is an ISO control character. */
	public boolean inRange (char c) {
		boolean control=Character.isISOControl(c);
		return control;
	}
}

/**
The lower case characters, as decided by Character.isLowerCase().
*/
class LowerRange extends RangeClass {
	public LowerRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is a lower case character. */
	public boolean inRange (char c) {
		boolean lower=Character.isLowerCase(c);
		return lower;
	}
}

/**
The upper case characters, as decided by Character.isUpperCase().
*/
class UpperRange extends RangeClass {
	public UpperRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is an upper case character. */
	public boolean inRange (char c) {
		boolean upper=Character.isUpperCase(c);
		return upper;
	}
}

/**
The space characters, as decided by Character.isSpaceChar().
*/
class SpaceRange extends RangeClass {
	public SpaceRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is a space character. */
	public boolean inRange (char c) {
		boolean space=Character.isSpaceChar(c);
		return space;
	}
}

/**
The white space characters, as decided by Character.isWhitespace().
*/
class WhiteSpaceRange extends RangeClass {
	public WhiteSpaceRange (boolean exclude) {
		super(exclude);
	}

	/** @return true, if c is a white space character. */
	public boolean inRange (char c) {
		boolean white=Character.isWhitespace(c);
		return white;
	}
}

/**
The Range class holds a vector of ranges, single characters, or named
ranges. All are subclasses of RangeClass.
*/

class Range
	extends Simple
{	/** The scanned RangeClass entries in the order of their appearance. */
	Vector V;
	/**
	The result of matchSimple(), if no entry contains the character.
	It stays true only, if no including entry was scanned.
	*/
	boolean Any;
	public Range (RegExp r)
	{	super(r);
		Any=true;
	}
	/**
	Scan the range from the opening [ up to and including the
	closing ], and an optional multiplicator behind it. A ^ switches
	to exclusion for all entries following it.
	@return true (errors are reported by exceptions).
	@throws RegExpException if the range is malformed or not closed.
	*/
	public boolean scan (Position p) throws RegExpException
	{	V=new Vector();	
		boolean exclude=false;
		p.advance();
		while (!p.end() && p.get()!=']')
		{	if (p.get()=='^')
			{	exclude=true; p.advance();
			}
			if (!exclude) Any=false;
			// Only an unescaped [ starts a named range. This must be
			// decided before getNext() removes the backslash, or else
			// \[ could never be used as a plain character.
			boolean named=(!p.end() && p.get()=='[');
			char a=getNext(p);
			if (named) scanNamedRange(p,exclude);
			else
			{	char b=a;
				if (p.get()=='-')
				{	p.advance();
					b=getNext(p);
				}
				V.addElement(new CharRange(a,b,exclude));
			}
		}
		if (p.end() || p.get()!=']')
			throw new RegExpException("bracket.range",p.pos());
		p.advance();
		scanMult(p);
		return true;
	}
	/**
	Scan a named range like [:alpha:]. The opening [ has already
	been read.
	@param exclude The exclude flag for the new entry.
	@throws RegExpException if the syntax is wrong or the name is unknown.
	*/
	public void scanNamedRange (Position p, boolean exclude) 
		throws RegExpException
	{	if (getNext(p)!=':')
			throw new RegExpException("bracket.namedrange",p.pos());
		StringBuffer b=new StringBuffer();
		while (true)
		{	char a=getNext(p);
			if (a==':') break;
			b.append(a);
		}
		if (getNext(p)!=']')
			throw new RegExpException("bracket.namedrange",p.pos());
		String s=b.toString();
		if (isName(s,"alpha")) V.addElement(new AlphaRange(exclude));
		else if (isName(s,"digit")) V.addElement(new NumericRange(exclude));
		else if (isName(s,"alnum")) V.addElement(new AlphaNumericRange(exclude));
		else if (isName(s,"cntrl")) V.addElement(new ControlRange(exclude));
		else if (isName(s,"lower")) V.addElement(new LowerRange(exclude));
		else if (isName(s,"upper")) V.addElement(new UpperRange(exclude));
		else if (isName(s,"space")) V.addElement(new SpaceRange(exclude));
		else if (isName(s,"white")) V.addElement(new WhiteSpaceRange(exclude));
		else
			throw new RegExpException("bracket.namedrange",p.pos());
	}
	/**
	Compare a scanned range name with a known name. If case is
	ignored, the whole expression was changed to upper case before
	the scan, so the name must be compared without regard to case.
	*/
	private boolean isName (String found, String name)
	{	if (R.IgnoreCase) return found.equalsIgnoreCase(name);
		else return found.equals(name);
	}
	/**
	Get the character at the position and advance behind it. A
	backslash is removed, so that \] etc. yield the plain character,
	and \t yields the tabulator.
	@return The character.
	@throws RegExpException if the expression ends at or directly
	behind the character.
	*/
	public char getNext (Position p) throws RegExpException
	{	if (p.end()) throw new RegExpException("bracket.range",p.pos());
		char c=p.get();
		if (c=='\\')
		{	p.advance();
			if (p.end())
				throw new RegExpException("illegal.backslash",p.pos());
			c=p.get();
			if (c=='t') c=(char)9;
		}
		p.advance();
		if (p.end())
			throw new RegExpException("bracket.range",p.pos());
		return c;
	}
	/**
	Walk through the vector of entries. An excluding entry, which
	contains the character, rejects it at once. An including entry,
	which contains it, accepts it, unless a later entry rejects it.
	@return The value of Any, if no entry contains the character.
	*/
	public boolean matchSimple (Position p)
	{	boolean match=Any;	
		for (int i=0; i<V.size(); i++)
		{	RangeClass r=(RangeClass)V.elementAt(i);
			boolean inside=r.inRange(R.uppercase(p.get()));
			if (r.isExclude())
			{	if (inside) return false;
			}
			else
			{	if (inside) match=true;
			}	
		}
		return match;
	}
}

/**
This scans and matches (expression).
*/

class Bracket
	extends Atom
{	/** The expression between the round brackets. */
	Part P;	
	/** Matches are recorded in R.E only, if this flag is set. */
	boolean Top;
	/** The index in R.E, where matches of this bracket are inserted. */
	int EN;
	/** The position object of the last match() call. */
	Position Pos;
	/** The index, where the last match() call started. */
	int K;
	/**
	@param r The regular expression the bracket belongs to.
	@param top True, if matches of this bracket are to be recorded.
	*/
	public Bracket (RegExp r, boolean top)
	{	super(r); Top=top;
		EN=r.EN;
		// Only recording brackets may use up an index. Formerly
		// nested brackets counted too, but never inserted anything,
		// so that a later bracket inserted behind the end of R.E
		// and Vector.insertElementAt() threw an exception.
		if (Top) r.EN++;
	}
	/**
	Scan the expression in the brackets, including both brackets.
	@return true (errors are reported by exceptions).
	@throws RegExpException if the closing bracket is missing.
	*/
	public boolean scan (Position p) throws RegExpException
	{	p.advance();
		P=new Part(R,false);
		P.scan(p);
		if (p.end() || p.get()!=')')
			throw new RegExpException("round.bracket",p.pos());
		p.advance();
		return true;
	}
	/**
	Match the expression in the brackets and record the matched
	range, if this is a recording bracket.
	*/
	public boolean match (Position p)
	{	Pos=p;	
		K=p.pos();
		boolean result=P.match(p);
		if (result && Top) record(p.pos());
		return result;
	}
	/**
	Ask the expression in the brackets for another match and record
	the new range, if this is a recording bracket.
	*/
	public boolean nextMatch ()
	{	boolean result=P.nextMatch();
		if (result && Top) record(Pos.pos());
		return result;
	}
	/**
	Insert the range from K to end into R.E at the index of this
	bracket. The index is limited to the size of R.E, since
	insertElementAt() does not accept larger indices.
	*/
	private void record (int end)
	{	int index=Math.min(EN,R.E.size());
		R.E.insertElementAt(new PositionRange(K,end),index);
	}
}

/**
Pos matches the nullstring at a specified position.
*/

class Pos
	extends Atom {
	/**
	The index to match, if it is not negative. A negative value
	counts from the end: -1 is the index behind the last character.
	*/
	int P;

	/**
	@param r The regular expression the atom belongs to.
	@param pos The index to match.
	*/
	public Pos (RegExp r, int pos) {
		super(r);
		this.P=pos;
	}

	/**
	Skip one character of the expression.
	@return true.
	*/
	public boolean scan (Position p) {
		p.advance(1);
		return true;
	}

	/**
	@return true, if the position is at the index described by P.
	The position is not advanced.
	*/
	public boolean match (Position p) {
		int wanted;
		if (P<0) wanted=p.length()+P+1;
		else wanted=P;
		return p.pos()==wanted;
	}
}

/**
Matches the string, which was found by a previous bracket (a back
reference like \0).
*/
class Previous
	extends Atom
{	/** The index of the bracket, as used by RegExp.getBracket(). */
	int P;
	/**
	@param r The regular expression the atom belongs to.
	@param p The index of the bracket to repeat.
	*/
	public Previous (RegExp r, int p)
	{	super(r);
		P=p;
	}
	/**
	Skip one character of the expression.
	@return true.
	*/
	public boolean scan (Position p)
	{	p.advance();
		return true;
	}
	/**
	Match the string of the bracket at the position. Both sides go
	through RegExp.uppercase(), so that the comparison ignores case,
	if the regular expression does. On failure the position is
	set back to where it was.
	@return false, if the bracket has no string, or the string
	does not follow at the position.
	*/
	public boolean match (Position p)
	{	String s=R.getBracket(P);
		if (s==null) return false;
		int start=p.pos();
		for (int i=0; i<s.length(); i++)
		{	if (p.end() || R.uppercase(s.charAt(i))!=R.uppercase(p.get()))
			{	p.pos(start);
				return false;
			}
			p.advance();
		}
		return true;
	}
}

/**
Branches are |ed to get a regular expression. Each branch
consists of a sequence of atoms.
*/

class Branch
{	/** The regular expression this branch belongs to. */
	RegExp R;
	/** The atoms of this branch in the order of their appearance. */
	Vector V;
	/** Passed on to the brackets created by this branch. */
	boolean Top;
	/** The position object of the last match, used by nextMatch() to match again. */
	Position LastPos;
	public Branch (RegExp r, boolean top)
	{	R=r; Top=top;
		V=new Vector();
	}
	/**
	Scan for atoms.
	The atoms are recognized by their first letter. The scan stops
	in front of | or ), or at the end of the expression.
	@return true, if the scan stopped at | or ), or if at least one
	atom was found.
	@throws RegExpException if the expression ends behind a backslash,
	or if an atom fails to scan.
	*/
	public boolean scan (Position p) throws RegExpException
	{	while (!p.end())
		{	char c=p.get();	
			Atom a;
			switch(c)
			{	case '.' :
					a=new Dot(R);
					break;
				case '\\' :
					p.advance();
					if (!p.end())
					{	switch (p.get())
						{	case 't' :
								a=new SpecialChar(R,(char)9); break;
							default :
								if (p.get()>='0' && p.get()<='9')
									a=new Previous(R,p.get()-'0');
								else 
									a=new Char(R);
								break;
						}
					}
					else throw new RegExpException("illegal.escape",p.pos());
					break;
				case '[' :
					a=new Range(R);
					break;
				case '(' :
					a=new Bracket(R,Top);
					break;
				case '|' :
					return true;
				case ')' :
					return true;
				case '^' :
					a=new Pos(R,0);
					break;
				case '$' :
					a=new Pos(R,-1);
					break;
				default :
					a=new Char(R);
					break;
			}
			a.scan(p);
			V.addElement(a);
		}	
		return V.size()>0;
	}
	/**
	Match all atoms of the branch one after the other.
	@return false too, if the branch has no atoms.
	*/
	public boolean match (Position p)
	{	return match(p,0);
	}
	/**
	The match is done by crawling through the atoms recursively.
	The atom i is asked for another match, until everything fails.
	*/
	public boolean match (Position p, int i)
	{	LastPos=p;
		if (i>=V.size()) return false;
		Atom a=(Atom)V.elementAt(i);
		if (i+1>=V.size()) return a.match(p);
		if (!a.match(p)) return false;
		if (match(p,i+1)) return true;
		while (a.nextMatch())
		{	if (match(p,i+1)) return true;
		}
		return false;
	}
	/**
	Search for another match.
	*/
	public boolean nextMatch ()
	{	return nextMatch(0);
	}
	/**
	Search for another match of the atoms from i on. The atoms
	behind atom i are asked first. Only if these have no other
	match, atom i itself is asked, and then the atoms behind it are
	matched from scratch at the new position. Formerly atom i was
	asked first and the following atoms were asked for a next match
	instead of a new one, so that possible matches were lost.
	*/
	public boolean nextMatch (int i)
	{	if (i>=V.size()) return false;
		Atom a=(Atom)V.elementAt(i);
		if (i+1>=V.size()) return a.nextMatch();
		if (nextMatch(i+1)) return true;
		// Without a previous match there is nothing to match again.
		if (LastPos==null) return false;
		while (a.nextMatch())
		{	if (match(LastPos,i+1)) return true;
		}
		return false;
	}
}

/**
A part is expression|part or a single expression.
*/

class Part
{	/** The regular expression this part belongs to. */
	RegExp R;
	/** The first alternative. */
	Branch Left;
	/** The remaining alternatives behind |, or null. */
	Part Right;
	/** True, if this part resets and sets the bracket counter R.EN. */
	boolean Top;
	/** The value of R.EN after the scan of the left branch (Top only). */
	int EN;
	/** The position object and the start index of the last match(). */
	Position Where;
	int Start;
	/** True, if the last match was found by the left branch. */
	boolean LeftMatched;
	public Part (RegExp r, boolean top)
	{	R=r; Top=top;
	}
	/**
	Scan the left branch and, behind a |, the remaining part.
	@return true, or the result of the scan of the remaining part.
	@throws RegExpException if one of the scans fails.
	*/
	public boolean scan (Position p) throws RegExpException
	{	if (Top) R.EN=0;
		Left=new Branch(R,Top);
		Left.scan(p);
		if (Top) EN=R.EN;
		if (!p.end() && p.get()=='|')
		{	if (Top) R.EN=0;
			Right=new Part(R,Top);
			p.advance();
			return Right.scan(p);
		}
		return true;
	}
	/**
	The match is true if the first part is true.
	or the remaining parts are true.
	*/
	public boolean match (Position p)
	{	int k=p.pos();
		Where=p; Start=k; LeftMatched=false;
		if (Top)
		{	R.E.removeAllElements();
			R.EN=0;
		}
		if (Left.match(p))
		{	LeftMatched=true;
			if (Top) R.EN=EN;	
			return true;
		}
		else
		{	p.pos(k);
			if (Right!=null) return Right.match(p);
			else return false;
		}
	}
	/**
	This tests for another match of any sub-branch. If the left
	branch found the last match and has no other one, the remaining
	part is matched from the start index of the last match(). If
	the last match was found by the remaining part, only this part
	is asked. Formerly the remaining part was never matched here,
	and the left branch was asked, even if it did not match.
	*/ 
	public boolean nextMatch () 
	{	if (LeftMatched)
		{	if (Left.nextMatch()) return true;
			LeftMatched=false;
			if (Right==null) return false;
			Where.pos(Start);
			return Right.match(Where);
		}
		if (Right!=null) return Right.nextMatch();
		return false;
	}
}


/**
This is a class to scan a string with a regular expression. It follows
the normal rules for regular expressions with some exceptions and
extensions. Any instance of this class can perform as a match tool for
input strings, using the match method.
<p>
Here is a formal description of a regular expression. It has one or
more branches, separated by |. It matches anything, that matches one
of the branches.
<p>
A branch has one or more atoms concatenated. It matches the string, if
the atoms match strings, which concatenate to the given string.
<p>
An atom is either a string of non-special letters. A special letter
(such as |) becomes a non-special letter, if it is preceded by \. Or
an atom is a regular expression in (). Or it is a sequence of letters
in [] (a range). Or it is a . indicating any character.
<p>
An atom followed by * may repeat zero or more times, followed by + one
or more times, followed by ? zero or one times.
<p>
A range consists of letters, ranges of letters as in A-Z, or a ^
character, indicating that the letters or letter ranges are excluded.
The special letters must be preceded by \.
<p>
There are the predefined ranges [:alpha:], [:digit:], [:alnum:],
[:space:], [:white:], [:cntrl:], [:lower:] and [:upper:]. Note that
the brackets are part of the range definition.
<p>
Contrary to the normal implementation, ] and - must be escaped, when
they are to appear in ranges. Also, ranges may contain include and
exclude character ranges at the same time, as in [a-z^x].
<p> 
The atom ^ matches only the beginning of the line, while $ matches the
line end. 
*/

public class RegExp
{	/** store the regular expression string here */	
	String S;
	/** the regular expression scanned tree */
	Part Left;
	/** the Valid flag */
	boolean Valid=false;
	/** the error string, if Valid is false */
	String ErrorString;
	/** the error position, if Valid is false */
	int Pos;
	/** the minimal length a string must have to match */
	int minLength=0;
	/** the found match */
	int StartMatch,EndMatch;
	/** a vector for the found expressions in brackets */
	Vector E;
	/** A counter to use for the brackets */
	int EN;
	/** Note the searched string here */
	char A[];
	/** Ignore case */
	boolean IgnoreCase=false;
	
	/**
	This scans a regular expression for further usage.
	The error flag may be checked with the valid()
	function.
	@param s The regular expression.
	@param ignorecase If true, the expression is changed to upper
	case, and so are the characters of the searched strings.
	*/
	public RegExp (String s, boolean ignorecase)
	{	char pattern[]=s.toCharArray();
		if (ignorecase)
		{	// Change the expression with the same character by
			// character mapping, which uppercase() applies to the
			// searched string. String.toUpperCase() depends on the
			// default locale and may change the length of the string.
			for (int i=0; i<pattern.length; i++)
				pattern[i]=Character.toUpperCase(pattern[i]);
			S=new String(pattern);
		}
		else S=s;
		E=new Vector();
		IgnoreCase=ignorecase;
		Position p=new Position(pattern);
		Left=new Part(this,true);
		ErrorString="";
		try
		{	Left.scan(p);
			Valid=true;
		}
		catch (RegExpException e)
		{	Valid=false;
			ErrorString=e.string();
			Pos=e.pos();
		}
		catch (Exception e)
		{	Valid=false;
			ErrorString="internal.error";
			Pos=0;
		}
	}
	
	/**
	Checks the error state for the regular expression.
	@return true, if there is no error.
	*/
	public boolean valid ()
	{	return Valid;
	}
	
	/**
	The error string tries to explain the error, when
	valid() return false.
	@return the error string
	*/
	public String errorString ()
	{	return ErrorString;
	}
	
	/**
	The position, where the scan error occured.
	@return the error position.
	*/
	public int errorPos ()
	{	return Pos;
	}
	
	/**
	Match the regular expression against a string.
	@return true, if a match was found.
	*/
	public boolean match (String s)
	{	return match(s.toCharArray(),0);
	}
	
	/**
	Search the characters for the first match, which starts at the
	index pos or behind it. The range of the match can be asked
	with startMatch() and endMatch().
	@param a The characters to search.
	@param pos The first start index to try. A negative value
	is taken as 0.
	@return true, if a match was found.
	*/
	public boolean match (char a[], int pos)
	{	A=a;
		Position p=new Position(A);
		int i,n=A.length-minLength;
		for (i=Math.max(pos,0); i<=n; i++)
		{	p.pos(i);
			if (Left.match(p)) 
			{	StartMatch=i;
				EndMatch=p.pos();
				return true;
			}
		}
		return false;
	}
	
	/**
	@return start position of matching string.
	*/
	public int startMatch ()
	{	return StartMatch;
	}
	
	/**
	@return end of the matching string.
	*/
	public int endMatch ()
	{	return EndMatch;
	}

	/**
	A main() to test the scanner. The first argument is the
	regular expression, the second one the string to search.
	*/
	public static void main (String args[])
	{	if (args.length<2)
		{	System.out.println("Usage: RegExp <expression> <string>");
			return;
		}
		RegExp R=new RegExp(args[0],false);
		if (R.Valid)
		{	if (R.match(args[1]))
			{	System.out.println("Matched from "+R.StartMatch+
					" to "+R.EndMatch);
				System.out.println(R.EN+" brackets assigned");
				for (int i=0; i<R.EN; i++)
				{	System.out.println(i+": "+R.expand("("+i+")"));
				}
			}
		}
		else System.out.println(R.ErrorString+" at "+R.Pos);
	}

	/**
	Return an enumeration with the found brackets. The
	objects are instances of the PositionRange class.
	@see rene.util.regexp.PositionRange
	*/	
	public Enumeration getBrackets ()
	{	return E.elements();
	}
	
	/**
	@return the number of elements in the vector of found brackets.
	*/
	public int getBracketNumber ()
	{	return E.size();
	}
	
	/**
	Get the string, which was found by a bracket.
	@param i The index of the bracket in the vector of found brackets.
	@return the string or null on error (e.g. no such bracket).
	*/
	public String getBracket (int i)
	{	try
		{	PositionRange r=(PositionRange)E.elementAt(i);
			return new String(A,r.start(),r.end()-r.start());
		}
		catch (Exception e)
		{	return null;
		}
	}
	
	/**
	Expand the replacement string and change (1), (2)
	etc. to the found bracket expansions. A backslash followed
	by t becomes a tabulator. Any other backslash is removed and the
	text behind it is copied as it is.
	@return expanded string or null on error.
	*/
	public String expand (String s)
	{	try	
		{	StringBuffer B=new StringBuffer();
			s=s.replace("\\t","\t");
			// The delimiters \, ( and ) are returned as tokens.
			StringTokenizer T=new StringTokenizer(s,"\\()",true);
			while (T.hasMoreTokens())
			{	String a=T.nextToken();
				if (a.equals("("))
				{	String b=T.nextToken();
					String c=T.nextToken();
					if (!c.equals(")")) return null;
					PositionRange p=
						(PositionRange)E.elementAt(Integer.parseInt(b));
					B.append(new String(A,p.start(),p.end()-p.start()));
				}
				else if (a.equals("\\"))
				{	a=T.nextToken();
					B.append(a);
				}
				else B.append(a);
			}
			return B.toString();
		}
		catch (Exception e)
		{	// A missing token, a wrong number or an unknown bracket
			// end up here. The documented result is null.
			return null;
		}
	}
	
	/**
	@return the upper case of c, if case is ignored, else c itself.
	*/
	char uppercase (char c)
	{	if (IgnoreCase) return Character.toUpperCase(c);
		else return c;
	}
}