/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/
package edu.nd.nina.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A {@link Lexer} that tokenizes a {@link CharSequence} by repeatedly finding
 * matches of a regular expression. Every non-empty match is one token; text
 * between matches is skipped.
 *
 * <p>
 * Tokens are located lazily: {@link #hasNext()} looks one token ahead, and the
 * offsets reported by {@link #getStartOffset()} / {@link #getEndOffset()}
 * always describe the most recently located token.
 */
public class CharSequenceLexer implements Lexer {

	// Some predefined lexing rules

	/** Runs of alphabetic characters. */
	public static final Pattern LEX_ALPHA = Pattern.compile("\\p{Alpha}+");

	/** Runs of word characters. */
	public static final Pattern LEX_WORDS = Pattern.compile("\\w+");

	/** Runs of non-whitespace characters. */
	public static final Pattern LEX_NONWHITESPACE_TOGETHER = Pattern
			.compile("\\S+");

	/** Runs of alphabetic characters, or runs of digits. */
	public static final Pattern LEX_WORD_CLASSES = Pattern
			.compile("\\p{Alpha}+|\\p{Digit}+");

	/** Runs of alphabetic characters, runs of digits, or one punctuation character. */
	public static final Pattern LEX_NONWHITESPACE_CLASSES = Pattern
			.compile("\\p{Alpha}+|\\p{Digit}+|\\p{Punct}");

	/**
	 * Runs of lowercase and uppercase Unicode letters. The two categories are
	 * combined as a union; an intersection ({@code &&}) of Ll and Lu is empty
	 * and would never match anything.
	 */
	public static final Pattern UNICODE_LETTERS = Pattern
			.compile("[\\p{Ll}\\p{Lu}]+");

	/** Pattern used to build the matcher the next time an input is set. */
	Pattern regex;

	/** Matcher over the current input, or null when there is no input. */
	Matcher matcher = null;

	/** The sequence being tokenized, or null. */
	CharSequence input;

	/** Text of the most recently located token, or null if there is none. */
	String matchText;

	/**
	 * True when {@link #matchText} has been located but not yet handed out by
	 * {@link #next()}.
	 */
	boolean matchTextFresh;

	/** Creates a lexer with no input that uses {@link #LEX_ALPHA}. */
	public CharSequenceLexer() {
		this(LEX_ALPHA);
	}

	/**
	 * Creates a lexer with no input.
	 *
	 * @param regex the pattern whose matches are the tokens
	 */
	public CharSequenceLexer(Pattern regex) {
		this.regex = regex;
		setCharSequence(null);
	}

	/**
	 * Creates a lexer with no input.
	 *
	 * @param regex the regular expression whose matches are the tokens
	 */
	public CharSequenceLexer(String regex) {
		this(Pattern.compile(regex));
	}

	/**
	 * Creates a lexer over the given input.
	 *
	 * @param input the sequence to tokenize; may be null
	 * @param regex the pattern whose matches are the tokens
	 */
	public CharSequenceLexer(CharSequence input, Pattern regex) {
		this(regex);
		setCharSequence(input);
	}

	/**
	 * Creates a lexer over the given input.
	 *
	 * @param input the sequence to tokenize; may be null
	 * @param regex the regular expression whose matches are the tokens
	 */
	public CharSequenceLexer(CharSequence input, String regex) {
		this(input, Pattern.compile(regex));
	}

	/**
	 * Replaces the input and restarts tokenization from its beginning, using
	 * the current pattern.
	 *
	 * @param input the new sequence to tokenize, or null to leave the lexer
	 *              without input (it then yields no tokens)
	 */
	public void setCharSequence(CharSequence input) {
		this.input = input;
		this.matchText = null;
		this.matchTextFresh = false;
		// Drop the matcher when the input is cleared; keeping the old one would
		// let the lexer go on returning tokens from the previous input.
		this.matcher = (input != null) ? regex.matcher(input) : null;
	}

	/**
	 * @return the sequence being tokenized, or null if none is set
	 */
	public CharSequence getCharSequence() {
		return input;
	}

	/**
	 * @return the regular expression of the current pattern
	 */
	public String getPattern() {
		return regex.pattern();
	}

	/**
	 * Changes the pattern used for tokenization. Nothing is recompiled when
	 * {@code reg} is the same expression as the current pattern.
	 *
	 * <p>
	 * The matcher over the current input is not rebuilt here; the new pattern
	 * is applied the next time {@link #setCharSequence(CharSequence)} is called.
	 *
	 * @param reg the new regular expression
	 */
	public void setPattern(String reg)// added by Fuchun
	{
		// Compare the two expressions as strings; comparing the Pattern object
		// with a String is never equal and defeats this guard.
		if (!reg.equals(getPattern())) {
			this.regex = Pattern.compile(reg);
		}
	}

	/**
	 * @return the start offset in the input of the most recently located token
	 *         (which may be the one looked ahead by {@link #hasNext()}), or -1
	 *         if there is no such token
	 */
	public int getStartOffset() {
		if (matchText == null)
			return -1;
		return matcher.start();
	}

	/**
	 * @return the offset just past the end of the most recently located token
	 *         (which may be the one looked ahead by {@link #hasNext()}), or -1
	 *         if there is no such token
	 */
	public int getEndOffset() {
		if (matchText == null)
			return -1;
		return matcher.end();
	}

	/**
	 * @return the text of the most recently located token, or null if there is
	 *         none
	 */
	public String getTokenString() {
		return matchText;
	}

	// Iterator interface methods

	/**
	 * Advances to the next non-empty match and stores it in {@link #matchText},
	 * or stores null when there is no matcher or no further match.
	 *
	 * <p>
	 * Patterns that can match the empty string (for example {@code ".*"})
	 * produce empty matches; these are skipped. This is done with a loop rather
	 * than recursion so that a long run of empty matches cannot exhaust the
	 * call stack.
	 */
	private void updateMatchText() {
		String located = null;
		if (matcher != null) {
			while (matcher.find()) {
				String candidate = matcher.group();
				if (candidate.length() > 0) {
					located = candidate;
					break;
				}
			}
		}
		matchText = located;
		matchTextFresh = true;
	}

	/**
	 * Looks ahead for the next token if one has not already been located.
	 *
	 * @return true if another token is available
	 */
	public boolean hasNext() {
		if (!matchTextFresh)
			updateMatchText();
		return (matchText != null);
	}

	/**
	 * Returns the next token, locating it first unless {@link #hasNext()} has
	 * already done so.
	 *
	 * @return the token text, or null when the input is exhausted or no input
	 *         is set
	 */
	public String next() {
		if (!matchTextFresh)
			updateMatchText();
		matchTextFresh = false;
		return matchText;
	}

	/**
	 * Not supported.
	 *
	 * @throws UnsupportedOperationException always
	 */
	public void remove() {
		throw new UnsupportedOperationException();
	}
}