package weka.datagenerators;
import weka.core.Option;
import weka.core.Utils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
/**
* Splits a given input string by matching a particular pattern.
* There are three lexing styles, namely <i>whitespace</i>,
* <i>alphanum</i>, and <i>alpha</i>.
*
* <p>The <i>whitespace</i> style keeps all strings that begin and end
* with non-whitespace characters, separated by whitespace. For
* example, the string "Austin, TX 78712-1188" will result in the
* tokens "Austin,", "TX," and "78712-1188".
*
* <p>The <i>alphanum</i> style keeps all alphanumeric strings, separated
* by non-alphanumeric characters. For example, the string "Austin,
* TX 78712-1188" will result in the tokens "Austin", "TX", "78712"
* and "1188".
*
* <p>The <i>alpha</i> style keeps all alphabetic strings, separated by
* non-alphabetic characters. For example, the string "Austin, TX
* 78712-1188" will result in the tokens "Austin" and "TX".
*
* <p><b>WEKA options:</b>
* <ul>
* <li><code>-y <str></code> - The lexing style, which is
* one of <code>whitespace</code>, <code>alphanum</code>, or
* <code>alpha</code>. This parameter has no default value and is
* not optional.
* </ul>
*
* @author ywwong
* @version $Id: SimpleLexer.java,v 1.1.1.1 2003/01/22 07:48:27 mbilenko Exp $
*/
class SimpleLexer extends Lexer {
public static final int WHITESPACE = 0;
public static final int ALPHANUM = 1;
public static final int ALPHA = 2;
/** Unit of size whereby the character buffer is increased. */
protected static final int INC = 50;
/** The character buffer. */
protected char[] m_buf;
/** The lexing style. */
protected int m_nStyle;
////// WEKA specific. //////
protected String m_strStyle;
////// Ends WEKA specific. //////
/**
* Creates a simple lexer.
*
* @param ts The TextSource object.
* @param reader The document reader.
*/
public SimpleLexer(TextSource ts, DocumentReader reader,
String[] options) throws Exception {
super(reader);
////// WEKA specific. //////
m_strStyle = Utils.getOption('y', options);
if (m_strStyle.length() == 0)
throw new Exception("Style (-y) not set.");
else if (m_strStyle.equals("whitespace"))
m_nStyle = WHITESPACE;
else if (m_strStyle.equals("alphanum"))
m_nStyle = ALPHANUM;
else if (m_strStyle.equals("alpha"))
m_nStyle = ALPHA;
else
throw new Exception("Invalid style (-y): \'" + m_strStyle + "\'.");
////// Ends WEKA specific. //////
m_buf = new char[INC];
}
/**
* Parses the next token from the input string.
*
* @return The next token if it's available; <code>null</code> if
* otherwise.
*/
public String nextToken() throws IOException {
char ch = 0;
boolean b;
int c;
int i;
// Skip separator.
c = m_reader.read();
while (c >= 0) {
b = false;
ch = (char) c;
switch (m_nStyle) {
case WHITESPACE:
b = !Character.isWhitespace(ch);
break;
case ALPHANUM:
b = Character.isLetterOrDigit(ch);
break;
case ALPHA:
b = Character.isLetter(ch);
break;
}
if (b)
break;
c = m_reader.read();
}
if (c < 0)
return null;
// Find the token.
i = 1;
m_buf[0] = ch;
c = m_reader.read();
while (c >= 0) {
b = false;
ch = (char) c;
switch (m_nStyle) {
case WHITESPACE:
b = Character.isWhitespace(ch);
break;
case ALPHANUM:
b = !Character.isLetterOrDigit(ch);
break;
case ALPHA:
b = !Character.isLetter(ch);
break;
}
if (b)
break;
if (i == m_buf.length) {
char[] newBuf = new char[i + INC];
for (int j = 0; j < i; j++)
newBuf[j] = m_buf[j];
m_buf = newBuf;
}
m_buf[i++] = ch;
c = m_reader.read();
}
return new String(m_buf, 0, i);
}
////// WEKA specific. //////
public static Collection listOptions() {
ArrayList aOpts;
aOpts = new ArrayList();
aOpts.add(new Option("\tSimpleLexer: Lexing style",
"y", 1, "-y <str>"));
return aOpts;
}
public Collection getOptions() {
ArrayList aOpts;
aOpts = new ArrayList();
aOpts.add("-y");
aOpts.add(m_strStyle);
return aOpts;
}
}