/* * Copyright 2005-2015 by BerryWorks Software, LLC. All rights reserved. * * This file is part of EDIReader. You may obtain a license for its use directly from * BerryWorks Software, and you may also choose to use this software under the terms of the * GPL version 3. Other products in the EDIReader software suite are available only by licensing * with BerryWorks. Only those files bearing the GPL statement below are available under the GPL. * * EDIReader is free software: you can redistribute it and/or modify it under the terms of the * GNU General Public License as published by the Free Software Foundation, either version 3 of * the License, or (at your option) any later version. * * EDIReader is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License along with EDIReader. If not, * see <http://www.gnu.org/licenses/>. */ package com.berryworks.edireader.tokenizer; import com.berryworks.edireader.EDIAbstractReader; import com.berryworks.edireader.EDIReader; import com.berryworks.edireader.EDISyntaxException; import com.berryworks.edireader.error.ErrorMessages; import org.xml.sax.SAXException; import java.io.IOException; import java.io.Reader; import java.io.Writer; import java.util.ArrayList; import java.util.List; public abstract class AbstractTokenizer implements Tokenizer, ErrorMessages { protected enum State { EXPECTING_SEGMENT, IN_SEGMENT, IN_COMPOSITE } protected enum CharacterClass { DATA, DELIMITER, SUB_DELIMITER, RELEASE, TERMINATOR, REPEAT_DELIMITER, EOF } protected CharacterClass cClass; protected State state; protected Writer outputWriter; protected boolean writingSuspended; protected final StringBuilder recording = new StringBuilder(); protected boolean recorderOn; protected int segmentCount; protected int segTokenCount; protected int charCount; protected int segCharCount; protected final Reader inputReader; protected char delimiter = '+'; protected char subDelimiter = ':'; protected char subSubDelimiter = '&'; // release is an int instead of a char so that it can hold // a char value (as a positive int) or an indicator of // "no release char" (an int value of -1). protected int release = -1; // repetitionSeparator is an int instead of a char so that it can hold // a char value (as a positive int) or an indicator of // "no repeating fields" (an int value of -1) protected int repetitionSeparator = -1; protected char terminator = '.'; protected boolean tokenReady; protected boolean repetition; protected boolean endOfFile; protected final Token currentToken; protected char cChar; protected boolean unGot; /** * Gets the count of segments that have been read or partially read. * * @return The segmentCount value */ public int getSegmentCount() { return segmentCount; } public int getElementInSegmentCount() { return segTokenCount; } public int getCharCount() { return charCount; } public int getSegmentCharCount() { return segCharCount; } @Override public void setCharCounts(int charCount, int segmentCharCount) { this.charCount = charCount; this.segCharCount = segmentCharCount; } public Reader getReader() { return inputReader; } public char getSubSubDelimiter() { return subSubDelimiter; } public void setSubSubDelimiter(char ssd) { subSubDelimiter = ssd; } /** * Sets the release character * * @param e The new release value */ public void setRelease(int e) { release = e; } /** * Gets the character used to delimit repeating fields. * * @return The repetition char, or -1 if no repetition char is in effect */ public int getRepetitionSeparator() { return repetitionSeparator; } /** * Sets the character used to delimit repeating fields. * * @param e The new value */ public void setRepetitionSeparator(int e) { // In EDITokenizer, -1 for a repetition char means that none is in effect. // An attempt to set it to zero is interpreted as an alternate way to indicate // that no repetition char is in effect, so we set the value to -1 for that // case as well. repetitionSeparator = e > 0 ? e : -1; } public void setTerminator(char d) { terminator = d; } public char getTerminator() { return terminator; } public char getDelimiter() { return delimiter; } public void setDelimiter(char d) { delimiter = d; } public char getSubDelimiter() { return subDelimiter; } public void setSubDelimiter(char sd) { subDelimiter = sd; } public void copy(char c) { if (outputWriter != null) try { outputWriter.write(c); } catch (IOException e) { // ignore } } public boolean isEndOfData() { return endOfFile; } /** * Returns the value of the next token, expected to be of type SIMPLE or * EMPTY. If <code>required</code> is true, then it may not be EMPTY. A * syntax exception is thrown for any other types. * * @param required an EMPTY token is not allowed * @param returnNullAtSegmentEnd governs behavior at end of segment * @return String value of the token * @throws SAXException unexpected tokens * @throws IOException for problem reading EDI data */ public String nextSimpleValue(boolean required, boolean returnNullAtSegmentEnd) throws SAXException, IOException { Token t = nextToken(); switch (t.getType()) { case EMPTY: if (required) throw new EDISyntaxException("Mandatory element missing in " + t.getSegmentType() + " segment", this); break; case SEGMENT_END: if (required) throw new EDISyntaxException("Mandatory element missing in " + t.getSegmentType() + " segment", this); else if (returnNullAtSegmentEnd) return null; else break; case SIMPLE: break; default: throw new EDISyntaxException(EXPECTED_SIMPLE_TOKEN, this); } return t.getValue(); } /** * Returns the value of the next token, expected to be of type SIMPLE or * EMPTY. If <code>required</code> is true, then it may not be EMPTY. A * syntax exception is thrown for any other types. * * @param required an EMPTY token is not allowed * @return String value of the token * @throws SAXException unexpected tokens * @throws IOException for problem reading EDI data */ public String nextSimpleValue(boolean required) throws SAXException, IOException { return nextSimpleValue(required, false); } /** * Equivalent to <code>nextSimpleValue(true)</code> * * @return String value of the token * @throws SAXException unexpected tokens * @throws IOException for problem reading EDI data */ public String nextSimpleValue() throws SAXException, IOException { return nextSimpleValue(true); } /** * Gets the next token expecting it to be a digit sequence. * * @return value integer value implied by digits * @throws SAXException for SAX compatibility * @throws IOException for problem reading EDI data * @throws com.berryworks.edireader.EDISyntaxException if contains non-digits * if empty */ public int nextIntValue() throws SAXException, IOException { int i; try { i = Integer.parseInt(nextSimpleValue()); } catch (NumberFormatException e) { throw new EDISyntaxException(DIGITS_ONLY, this); } return i; } /** * Parses the next token expecting to find a composite element - one * composed of subelements separated by the subElementDelimiter. * * @return subelements as a Vector of Strings * @throws IOException for problem reading EDI data * @throws com.berryworks.edireader.EDISyntaxException if invalid EDI is detected */ public List<String> nextCompositeElement() throws IOException, EDISyntaxException { return nextCompositeElement(false); } public List<String> nextCompositeElement(boolean returnNullAtSegmentEnd) throws IOException, EDISyntaxException { List<String> result = new ArrayList<>(); loop: while (true) { Token t = nextToken(); switch (t.getType()) { case SUB_ELEMENT: // add this token's value to the list and // others that follow it result.add(t.getValue()); if (t.isLast()) break loop; break; case SUB_EMPTY: result.add(""); if (t.isLast()) break loop; break; case SIMPLE: // We saw a simple token terminated by a normal // element delimiter, not the subElement delimiter. // Treat this as a composite element with only one // value. result.add(t.getValue()); break loop; case EMPTY: // An empty token terminated by // a normal element delimiter, segment end, etc. // Treat this as a composite element with no values // by returning an empty List. break loop; case SEGMENT_END: if (returnNullAtSegmentEnd) return null; break loop; default: throw new EDISyntaxException(INVALID_COMPOSITE, this); } } return result; } /** * Arranges for the token most recently returned by <code>nextToken</code> * to be returned again if a future call to <code>nextToken</code>. */ public void ungetToken() { tokenReady = true; } /** * Arranges for getChar() to see the current char again the next time it is * called, in effect "putting back" that char to be seen again. */ public void ungetChar() { unGot = true; charCount--; segCharCount--; } /** * Returns the next Token from the InputSource, or null if there are no more * tokens. * * @return Token * @throws IOException for problem reading EDI data * @throws com.berryworks.edireader.EDISyntaxException if invalid EDI is detected */ public Token nextToken() throws IOException, EDISyntaxException { if (!tokenReady) advance(); tokenReady = false; return currentToken; } /** * Peeks ahead to determine if nextToken() would find another token. * * @return true if there is another token available to nextToken() * @throws IOException for problem reading EDI data * @throws com.berryworks.edireader.EDISyntaxException if invalid EDI is detected */ public boolean hasMoreTokens() throws IOException, EDISyntaxException { if (!tokenReady) advance(); return tokenReady; } /** * Skips over tokens until the beginning of a new segment is encountered. * * @return segType String containing value of leading field * @throws SAXException for problem emitting SAX events * @throws IOException for problem reading EDI data */ public String nextSegment() throws SAXException, IOException { Token t; int i = 0; while (true) { t = nextToken(); Token.TokenType tokenType = t.getType(); if (tokenType == Token.TokenType.SEGMENT_START) break; if (tokenType == Token.TokenType.END_OF_DATA) throw new EDISyntaxException(UNEXPECTED_EOF, this); if (++i > 30) throw new EDISyntaxException("Too many fields for " + t.getSegmentType() + " segment (Segment terminator problem?)", this); } return t.getSegmentType(); } /** * Skips over tokens until an END_SEGMENT token is reached, marking the end * of the current segment. This Tokenizer is therefore positioned so that * the next call to getToken() sees the first token after the end of this * segment. * * @return token SEGMENT_END * @throws SAXException for problem emitting SAX events * @throws IOException for problem reading EDI data */ public Token skipSegment() throws SAXException, IOException { Token t; int i = 0; while (true) { t = nextToken(); Token.TokenType tokenType = t.getType(); if ((tokenType == Token.TokenType.SEGMENT_END) || (tokenType == Token.TokenType.END_OF_DATA)) break; if (++i > 30) throw new EDISyntaxException("Too many fields in " + t.getSegmentType() + " segment", this); } return t; } /** * Scans a series of data characters up to the first character other than a * data character. * <p> * Each character is appended to the value of the current token. Upon * return, cChar and cClass are left referencing the character after the * last character of data; in other words, getChar() will have been called * seeing something other than a character of data. Release sequences are * handled within this method. * * @param limit maximum number of data characters allowed in the element * @return class of char that caused scan to stop * @throws IOException problem reading EDI input * @throws com.berryworks.edireader.EDISyntaxException specific syntax error in parsed EDI data */ protected CharacterClass scanData(int limit) throws IOException, EDISyntaxException { loop: while (true) { getChar(); switch (cClass) { case RELEASE: // Ignore this release character, but get the next // character and treat it as data (by falling in // to the following case) without regard to the class // that character would naturally be. getChar(); case DATA: if (--limit == 0) throw new EDISyntaxException(ELEMENT_TOO_LONG, this); currentToken.append(cChar); break; case SUB_DELIMITER: break loop; case REPEAT_DELIMITER: repetition = true; break loop; case TERMINATOR: ungetChar(); // fall into the default logic below default: repetition = false; break loop; } } return cClass; } /** * Equivalent to scanData(infinite) * * @return class of char that caused scan to stop * @throws IOException for problem reading EDI data * @throws com.berryworks.edireader.EDISyntaxException if invalid EDI is detected */ protected CharacterClass scanData() throws IOException, EDISyntaxException { return scanData(0); } /** * Scans over a series of characters that, after a segment terminator, are * considered to be ignorable whitespace. This allows segments with formal * segment terminator characters to be followed be line-oriented characters * (line feeds and carriage returns). * * @throws IOException for problem reading EDI data */ public void scanTerminatorSuffix() throws IOException { do { getChar(); } while (cClass != CharacterClass.EOF && WHITESPACE.indexOf(cChar) != -1); ungetChar(); } public char[] getChars(int n) throws IOException, EDISyntaxException { char[] result = new char[n]; for (int i = 0; i < n; i++) { getChar(); if (cClass == CharacterClass.EOF) throw new EDISyntaxException("Encountered end of data unexpectedly after reading " + i + " characters of an expected " + n + " character sequence"); result[i] = cChar; } return result; } /** * The outputWriter provides the service of copying parsed data to an output * destination. This is particularly useful in splitting applications. This * service is optional; setWriter(null) turns copying off, which is the * default condition. In addition, copying can be suspended and then resumed * via the suspendWriting() method. Note that with writing suspended, the * copyToken() and similar methods are still available for manual copying of * data to the output destination. * * @param writer to receive a copy EDI data as it is read, if null then copying is disabled */ public void setWriter(Writer writer) { outputWriter = writer; } /** * Return the recording. * * @return The recording value */ public String getRecording() { return recording.toString(); } /** * Shorthand for EDIReader.trace(String) * * @param string text message to appear in trace */ protected void trace(String string) { EDIAbstractReader.trace(string); } /** * Used in conjunction with setWriter to temporarily suspend and then resume * copying parsed data to an output destination. * * @param b true suspends copying, false enables it again */ public void suspendWriting(boolean b) { writingSuspended = b; } /** * Turn the recorder on (true) or off (false). * * @param b The new recorder value */ public void setRecorder(boolean b) { recorderOn = b; if (EDIReader.debug) trace("recorder turned " + (b ? "on" : "off")); } public AbstractTokenizer(Reader source) { state = State.EXPECTING_SEGMENT; outputWriter = null; inputReader = source; tokenReady = false; currentToken = new TokenImpl(this); } /** * Advances to the next token. Sets tokenReady, currentToken, and state. * * @throws IOException for problem reading EDI data * @throws com.berryworks.edireader.EDISyntaxException if invalid EDI is detected */ protected void advance() throws IOException, EDISyntaxException { getChar(); tokenReady = true; switch (cClass) { case RELEASE: // Ignore this release character, but get the next // character and treat it as data (by falling in // to the following case) without regard to the class // that character would naturally be. getChar(); case DATA: switch (state) { case IN_SEGMENT: segTokenCount++; currentToken.setType(Token.TokenType.SIMPLE); currentToken.setValue(cChar); if (!repetition) currentToken.incrementIndex(); currentToken.resetSubElementIndex(); if (scanData() == CharacterClass.SUB_DELIMITER) { // We have a composite token instead of a simple one currentToken.setType(Token.TokenType.SUB_ELEMENT); currentToken.setLast(false); state = State.IN_COMPOSITE; } break; case IN_COMPOSITE: segTokenCount++; currentToken.setType(Token.TokenType.SUB_ELEMENT); currentToken.incrementSubElementIndex(); currentToken.setValue(cChar); if (scanData() != CharacterClass.SUB_DELIMITER) { // We hit something that marks the end of a series of subelements state = State.IN_SEGMENT; currentToken.setLast(true); } break; default: // We are at the beginning of a segment segmentCount++; segTokenCount = 1; segCharCount = 1; currentToken.setType(Token.TokenType.SEGMENT_START); currentToken.setValue(cChar); currentToken.resetIndexes(); scanData(10); currentToken.setSegmentType(currentToken.getValue()); state = State.IN_SEGMENT; } break; case TERMINATOR: switch (state) { case IN_COMPOSITE: // return an empty subelement token, marked as last, // before returning the segment terminator token. currentToken.incrementSubElementIndex(); currentToken.setLast(true); currentToken.setType(Token.TokenType.SUB_EMPTY); currentToken.resetValue(); ungetChar(); // change state so that next time // we will go down a different path. state = State.IN_SEGMENT; break; default: currentToken.setType(Token.TokenType.SEGMENT_END); state = State.EXPECTING_SEGMENT; scanTerminatorSuffix(); currentToken.resetSubElementIndex(); } break; case DELIMITER: switch (state) { case IN_COMPOSITE: // return an empty subelement token, marked as last, // before returning the delimiter token. currentToken.incrementSubElementIndex(); currentToken.setType(Token.TokenType.SUB_EMPTY); state = State.IN_SEGMENT; break; default: segTokenCount++; currentToken.incrementIndex(); currentToken.resetSubElementIndex(); currentToken.setType(Token.TokenType.EMPTY); } currentToken.setLast(true); currentToken.resetValue(); break; case SUB_DELIMITER: switch (state) { case IN_SEGMENT: if (!repetition) currentToken.incrementIndex(); state = State.IN_COMPOSITE; currentToken.resetSubElementIndex(); break; case IN_COMPOSITE: currentToken.incrementSubElementIndex(); } currentToken.setLast(false); currentToken.setType(Token.TokenType.SUB_EMPTY); currentToken.resetValue(); break; case REPEAT_DELIMITER: switch (state) { case IN_COMPOSITE: // return an empty subelement token, marked as last currentToken.incrementSubElementIndex(); currentToken.setLast(true); currentToken.setType(Token.TokenType.SUB_EMPTY); currentToken.resetValue(); state = State.IN_SEGMENT; repetition = true; break; case IN_SEGMENT: // return an empty element if (!repetition) { currentToken.incrementIndex(); } currentToken.resetSubElementIndex(); currentToken.setLast(false); currentToken.setType(Token.TokenType.EMPTY); currentToken.resetValue(); repetition = true; } break; case EOF: currentToken.setType(Token.TokenType.END_OF_DATA); tokenReady = false; break; } } }