/* * Copyright 2005-2015 by BerryWorks Software, LLC. All rights reserved. * * This file is part of EDIReader. You may obtain a license for its use directly from * BerryWorks Software, and you may also choose to use this software under the terms of the * GPL version 3. Other products in the EDIReader software suite are available only by licensing * with BerryWorks. Only those files bearing the GPL statement below are available under the GPL. * * EDIReader is free software: you can redistribute it and/or modify it under the terms of the * GNU General Public License as published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * EDIReader is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with EDIReader. If not, * see <http://www.gnu.org/licenses/>. */ package com.berryworks.edireader.tokenizer; import com.berryworks.edireader.EDIReader; import java.io.IOException; import java.io.Reader; import java.nio.CharBuffer; /** * Interprets EDI input as a sequence of primitive syntactic tokens. * <p> * As an EDI interchange is parsed, the parser uses a Tokenizer to advance through the * input EDI stream one token at a time. A call to <code>nextToken()</code> causes the tokenizer to advance * past the next token and return a <code>Token</code> instance describing that token. * <p> * This implementation of Tokenizer uses CharBuffer instead of char[]. */ public class EDITokenizer extends AbstractTokenizer { public static final int BUFFER_SIZE = 1000; private final CharBuffer charBuffer = CharBuffer.wrap(new char[BUFFER_SIZE]); public EDITokenizer(Reader source) { super(source); charBuffer.flip(); if (EDIReader.debug) trace("Constructed a new EDITokenizer"); } public EDITokenizer(Reader source, char[] preRead) { this(source); if (preRead == null || preRead.length == 0) return; if (preRead.length > charBuffer.capacity()) throw new RuntimeException("Attempt to create EDITokenizer with " + preRead.length + " pre-read chars, which is greater than the internal buffer size of " + charBuffer.capacity()); charBuffer.clear(); charBuffer.put(preRead); charBuffer.flip(); } /** * Returns a String representation of the current state of the tokenizer * for testing and debugging purposes. * * @return String representation */ @Override public String toString() { String result = "tokenizer state:"; result += " segmentCount=" + segmentCount; result += " charCount=" + charCount; result += " segTokenCount=" + segTokenCount; result += " segCharCount=" + segCharCount; result += " currentToken=" + currentToken; result += " buffer.limit=" + charBuffer.limit(); result += " buffer.position=" + charBuffer.position(); return result; } /** * Gets the next character of input. Sets cChar and cClass * * @throws IOException for problem reading EDI data */ public void getChar() throws IOException { if (unGot) { // The current character has been "put back" with ungetChar() // after having been seen with getChar(). Therefore, this call // to getChar() can simply reget the current character. unGot = false; charCount++; segCharCount++; return; } // Read a fresh character from the input source. // But first copy the current one to an outputWriter // or the recorder if necessary. if (outputWriter != null) { // We do have an outputWriter wanting data, but do we have // a current character to write? And make sure writing is // not suspended. if ((!endOfFile) && (!writingSuspended)) outputWriter.write(cChar); } if (recorderOn) recording.append(cChar); if (charBuffer.remaining() == 0) { readUntilBufferProvidesAtLeast(1); } if (endOfFile) { cClass = CharacterClass.EOF; if (EDIReader.debug) trace("end-of-file encountered"); } else { cChar = charBuffer.get(); if (cChar == delimiter) cClass = CharacterClass.DELIMITER; else if (cChar == subDelimiter) cClass = CharacterClass.SUB_DELIMITER; else if (cChar == release) cClass = CharacterClass.RELEASE; else if (cChar == terminator) cClass = CharacterClass.TERMINATOR; else if (cChar == repetitionSeparator) cClass = CharacterClass.REPEAT_DELIMITER; else cClass = CharacterClass.DATA; } charCount++; segCharCount++; } /** * Gets the remaining chars that have been read into the buffer * and not returned by getChars(n) or equivalant. Chars previewed * by lookahead(n) are not considered to have been used and therefore * are included among the chars returned by getBuffered. * * The use of getBuffered() is intended for only very special situations. * For example, if an input stream contains multiple fully independent EDI * interchanges -- perhaps from different EDI standards -- it is useful to * logically "start from scratch" on each successive interchange, with new * parser, tokenizer, buffer, etc, with any chars remaining in the buffer * from the previous interchange to be used as new data. * * @return chars of unprocessed input data */ public char[] getBuffered() { char[] result = new char[0]; // if (endOfFile) // return result; if (charBuffer.remaining() == 0 && !unGot) { return result; } try { int n = charBuffer.remaining(); if (endOfFile && n == 0) { // Special case: if we've hit eof and the charBuffer is empty // ignore an unGot char if there is one. } else { n += unGot ? 1 : 0; } result = lookahead(n); } catch (Exception ignore) { } return result; } /** * Look ahead into the source of input chars and return the next n chars to * be seen, without disturbing the normal operation of getChar(). * * @param n number of chars to return * @return char[] containing upcoming input chars * @throws IOException for problem reading EDI data */ public char[] lookahead(int n) throws IOException { if (EDIReader.debug) trace("EDITokenizer.lookahead(" + n + ")"); char[] rval = new char[n]; // The 1st char is grabbed using the tokenizer's built-in // getChar() / ungetChar() mechanism. This allows things to work // properly whether or not the next char has already been gotten. getChar(); rval[0] = cChar; ungetChar(); // The minus 1 is because we have already filled the first char of the return value, so we only need n-1 more if (charBuffer.remaining() < n - 1) { if (EDIReader.debug) trace("Buffering more data to satisfy lookahead(" + n + ")"); readUntilBufferProvidesAtLeast(n - 1); } // Move chars from the buffer into the return value int j = 1; for (int i = charBuffer.position(); i < charBuffer.limit() && j < n; i++) rval[j++] = charBuffer.get(i); // If more lookahead chars were requested than were satisfied for any reason, // then fill the return value with '?' to the requested length. for (; j < n; ) { rval[j++] = '?'; // throw new RuntimeException("problem with lookahead " + n); } return rval; } private void readUntilBufferProvidesAtLeast(int needed) throws IOException { int remaining; while ((remaining = charBuffer.remaining()) < needed) { if (EDIReader.debug) trace("Reading from input stream because at least " + needed + " chars are needed and only " + remaining + " are avilalble"); charBuffer.compact(); int n; while ((n = inputReader.read(charBuffer)) == 0) { } charBuffer.flip(); if (n < 0) { if (EDIReader.debug) trace("Hit end of file on the input stream"); endOfFile = true; break; } else { if (EDIReader.debug) trace("Number of chars read from input stream: " + n); } } } }