SimpleLexer.java example

Explorer
wekax-master
- weka-3-6-2
- wekaUT
  - GetAllSubPackages.java
  - weka
package weka.datagenerators;

import weka.core.Option;
import weka.core.Utils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Splits a given input string by matching a particular pattern.
 * There are three lexing styles, namely <i>whitespace</i>,
 * <i>alphanum</i>, and <i>alpha</i>.
 *
 * <p>The <i>whitespace</i> style keeps all strings that begin and end
 * with non-whitespace characters, separated by whitespace.  For
 * example, the string "Austin, TX 78712-1188" will result in the
 * tokens "Austin,", "TX," and "78712-1188".
 *
 * <p>The <i>alphanum</i> style keeps all alphanumeric strings, separated
 * by non-alphanumeric characters.  For example, the string "Austin,
 * TX 78712-1188" will result in the tokens "Austin", "TX", "78712"
 * and "1188".
 *
 * <p>The <i>alpha</i> style keeps all alphabetic strings, separated by
 * non-alphabetic characters.  For example, the string "Austin, TX
 * 78712-1188" will result in the tokens "Austin" and "TX".
 *
 * <p><b>WEKA options:</b>
 * <ul>
 *   <li><code>-y <str></code> - The lexing style, which is
 *   one of <code>whitespace</code>, <code>alphanum</code>, or
 *   <code>alpha</code>.  This parameter has no default value and is
 *   not optional.
 * </ul>
 *
 * @author ywwong
 * @version $Id: SimpleLexer.java,v 1.1.1.1 2003/01/22 07:48:27 mbilenko Exp $
 */
class SimpleLexer extends Lexer {

    public static final int WHITESPACE = 0;
    public static final int ALPHANUM = 1;
    public static final int ALPHA = 2;

    /** Unit of size whereby the character buffer is increased. */
    protected static final int INC = 50;

    /** The character buffer. */
    protected char[] m_buf;

    /** The lexing style. */
    protected int m_nStyle;

    ////// WEKA specific. //////

    protected String m_strStyle;

    ////// Ends WEKA specific. //////

    /**
     * Creates a simple lexer.
     *
     * @param ts      The TextSource object.
     * @param reader  The document reader.
     */
    public SimpleLexer(TextSource ts, DocumentReader reader,
                       String[] options) throws Exception {
        super(reader);

        ////// WEKA specific. //////

        m_strStyle = Utils.getOption('y', options);
        if (m_strStyle.length() == 0)
            throw new Exception("Style (-y) not set.");
        else if (m_strStyle.equals("whitespace"))
            m_nStyle = WHITESPACE;
        else if (m_strStyle.equals("alphanum"))
            m_nStyle = ALPHANUM;
        else if (m_strStyle.equals("alpha"))
            m_nStyle = ALPHA;
        else
            throw new Exception("Invalid style (-y): \'" + m_strStyle + "\'.");

        ////// Ends WEKA specific. //////

        m_buf = new char[INC];
    }

    /**
     * Parses the next token from the input string.
     * 
     * @return The next token if it's available; <code>null</code> if
     * otherwise.
     */
    public String nextToken() throws IOException {
        char ch = 0;
        boolean b;
        int c;
        int i;

        // Skip separator.
        c = m_reader.read();
        while (c >= 0) {
            b = false;
            ch = (char) c;
            switch (m_nStyle) {
            case WHITESPACE:
                b = !Character.isWhitespace(ch);
                break;
            case ALPHANUM:
                b = Character.isLetterOrDigit(ch);
                break;
            case ALPHA:
                b = Character.isLetter(ch);
                break;
            }
            if (b)
                break;
            c = m_reader.read();
        }

        if (c < 0)
            return null;

        // Find the token.
        i = 1;
        m_buf[0] = ch;
        c = m_reader.read();
        while (c >= 0) {
            b = false;
            ch = (char) c;
            switch (m_nStyle) {
            case WHITESPACE:
                b = Character.isWhitespace(ch);
                break;
            case ALPHANUM:
                b = !Character.isLetterOrDigit(ch);
                break;
            case ALPHA:
                b = !Character.isLetter(ch);
                break;
            }
            if (b)
                break;

            if (i == m_buf.length) {
                char[] newBuf = new char[i + INC];

                for (int j = 0; j < i; j++)
                    newBuf[j] = m_buf[j];
                m_buf = newBuf;
            }
            m_buf[i++] = ch;
            c = m_reader.read();
        }
        return new String(m_buf, 0, i);
    }

    ////// WEKA specific. //////

    public static Collection listOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        aOpts.add(new Option("\tSimpleLexer: Lexing style",
                             "y", 1, "-y <str>"));
        return aOpts;
    }

    public Collection getOptions() {
        ArrayList aOpts;

        aOpts = new ArrayList();
        aOpts.add("-y");
        aOpts.add(m_strStyle);
        return aOpts;
    }

}