// Tokenizer.java example (Explorer: jena-master)
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.jena.util;

import java.util.NoSuchElementException;

/**
 * A tokenizer, similar to java's StringTokenizer but allows for quoted
 * character strings which can include other separators.
 * <p>
 * The source is split into three kinds of token: single-character
 * delimiters (any character in {@code delim}), single-character literal
 * delimiters (any character in {@code literalDelim}, i.e. the quotes), and
 * text tokens. Text between a pair of matching literal delimiters is
 * returned as one token with backslash escapes resolved. When
 * {@code returnDelims} is false both kinds of delimiter are skipped, but
 * the content of a quoted literal is always returned, even if it is empty
 * or consists of a single delimiter character.
 */
public class Tokenizer {

    /** The string being parsed */
    protected String source;

    /** The index of the first unreturned char in source */
    protected int p;

    /** The set of delimiter characters */
    protected String delim;

    /** If true then delimiters should be returned as tokens */
    protected boolean returnDelims;

    /** Literal string delimiters */
    protected String literalDelim;

    /** The lex state */
    protected int state;

    /** A lookahead for tokens */
    protected String lookahead;

    /**
     * True if the token most recently produced by {@link #getNextToken()} was a
     * delimiter or literal delimiter rather than token text. Tracking this
     * explicitly (instead of inspecting the token's content) keeps a quoted
     * literal such as {@code ','} from being mistaken for a delimiter.
     */
    private boolean lastWasDelimiter;

    /** State flag: normal parse */
    protected static final int NORMAL = 1;

    /** State flag: start of literal */
    protected static final int LITERAL_START = 2;

    /** State flag: end of literal */
    protected static final int LITERAL_END = 3;

    /**
     * Constructor.
     * @param str the source string to be parsed
     * @param delim The set of delimiter characters
     * @param literalDelim Literal string delimiters
     * @param returnDelims If true then delimiters should be returned as tokens
     */
    public Tokenizer(String str, String delim, String literalDelim, boolean returnDelims) {
        this.source = str;
        this.delim = delim;
        this.literalDelim = literalDelim;
        this.returnDelims = returnDelims;
        this.p = 0;
        this.state = NORMAL;
    }

    /**
     * Return the next token.
     * @return the next token; delimiters are only returned if
     *         {@code returnDelims} was set
     * @throws java.util.NoSuchElementException if there are no more tokens available
     */
    public String nextToken() {
        // Going through hasMoreTokens() guarantees that both methods agree
        // on whether a returnable token is left.
        if (!hasMoreTokens()) {
            throw new NoSuchElementException("No more elements in tokenized string");
        }
        String result = lookahead;
        lookahead = null;
        return result;
    }

    /**
     * Test if there are more tokens which can be returned.
     * @return true if a subsequent call to {@link #nextToken()} will succeed
     */
    public boolean hasMoreTokens() {
        if (lookahead == null) {
            lookahead = nextReturnableToken();
        }
        return lookahead != null;
    }

    /**
     * Fetch the next token that should be visible to the caller, skipping
     * delimiters when they are not to be returned.
     * @return the token, or null if the source is exhausted
     */
    private String nextReturnableToken() {
        String token = getNextToken();
        // Iterate rather than recurse so long delimiter runs cannot exhaust the stack.
        while (token != null && !returnDelims && lastWasDelimiter) {
            token = getNextToken();
        }
        return token;
    }

    /**
     * Find the next token which can either be a delimiter or a real token.
     * Sets {@link #lastWasDelimiter} to describe the token returned.
     * @return the token, or null if the source is exhausted
     */
    private String getNextToken() {
        if (p >= source.length()) {
            return null;
        }
        switch (state) {
        case NORMAL:
            if (is(literalDelim)) {
                state = LITERAL_START;
                return consumeDelimiter();
            }
            if (is(delim)) {
                return consumeDelimiter();
            }
            return readPlainToken();
        case LITERAL_START:
            return readLiteral();
        case LITERAL_END:
            // The current character is the closing quote found by readLiteral().
            state = NORMAL;
            return consumeDelimiter();
        default:
            return null;
        }
    }

    /**
     * Consume the current character and return it as a one-character delimiter token.
     */
    private String consumeDelimiter() {
        lastWasDelimiter = true;
        p++;
        return source.substring(p - 1, p);
    }

    /**
     * Consume an unquoted token. The token runs up to the next character in
     * {@code delim} or the end of the source; a literal delimiter inside
     * the run does not end it.
     */
    private String readPlainToken() {
        lastWasDelimiter = false;
        int start = p;
        p++;
        while (p < source.length() && !is(delim)) {
            p++;
        }
        return source.substring(start, p);
    }

    /**
     * Consume the body of a quoted literal, leaving {@code p} on the closing
     * quote (or at the end of the source if the literal is unterminated).
     * A backslash causes the following character to be taken verbatim.
     */
    private String readLiteral() {
        lastWasDelimiter = false;
        // The opening quote was the previous character; only the same
        // character closes the literal.
        char quote = source.charAt(p - 1);
        StringBuilder literal = new StringBuilder();
        while (p < source.length()) {
            char c = source.charAt(p);
            if (c == '\\') {
                p++;
                if (p >= source.length()) {
                    break; // dangling backslash at end of input is dropped
                }
                c = source.charAt(p);
            } else if (c == quote) {
                break;
            }
            literal.append(c);
            p++;
        }
        state = LITERAL_END;
        return literal.toString();
    }

    /**
     * Returns true if the current character is contained in the given classification.
     */
    private boolean is(String classification) {
        return classification.indexOf(source.charAt(p)) != -1;
    }
}