/***********************************************************************************************************************
* Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
**********************************************************************************************************************/
package eu.stratosphere.types.parser;
import java.nio.charset.Charset;
/**
* Converts a variable-length field of a byte array into a {@link String}. The bytes between
* delimiters are interpreted as an ASCII string. The string may be enclosed in double quotes. For
* quoted strings, whitespace (space and tab) before the opening quote and after the closing quote is ignored.
*/
public class AsciiStringParser extends FieldParser<String> {

    // Single-byte, ASCII-compatible charset used to decode the field bytes.
    private static final Charset CHARSET = Charset.forName("ISO-8859-1");

    private static final byte WHITESPACE_SPACE = (byte) ' ';

    private static final byte WHITESPACE_TAB = (byte) '\t';

    private static final byte QUOTE_DOUBLE = (byte) '"';

    // The string produced by the most recent call to parseField that reached a decode step.
    private String result;

    /**
     * Parses one field from {@code bytes}, starting at {@code startPos} and reading at most up to
     * (excluding) {@code limit}.
     *
     * <p>A field whose first non-whitespace byte is a double quote is treated as a quoted string:
     * the result is the content between the quotes, and only spaces/tabs may follow the closing
     * quote before the delimiter. Any other field is taken verbatim from {@code startPos} up to the
     * delimiter, including leading whitespace.
     *
     * <p>Leading whitespace is only skipped as long as it is not the delimiter itself, so that
     * space- or tab-delimited input keeps its (possibly empty) fields intact.
     *
     * @param bytes    the buffer holding the raw field data
     * @param startPos index of the first byte of the field
     * @param limit    index one past the last readable byte
     * @param delim    the field delimiter; it is compared as a single byte (narrowed via a cast)
     * @param reusable unused by this parser; a new {@link String} is created for every field
     * @return the index of the first byte after the field's delimiter, {@code limit} if the field
     *         ran to the end of the readable range, or {@code -1} if the field is malformed (an
     *         error state is recorded via {@code setErrorState} in that case)
     */
    @Override
    public int parseField(byte[] bytes, int startPos, int limit, char delim, String reusable) {
        final byte delByte = (byte) delim;
        int i = startPos;

        // Skip leading spaces/tabs, but never step over the delimiter: with a space or tab
        // delimiter that would swallow field boundaries and merge neighbouring fields.
        while (i < limit && bytes[i] != delByte && isWhitespace(bytes[i])) {
            i++;
        }

        if (i < limit && bytes[i] == QUOTE_DOUBLE) {
            // content starts right after the opening quote
            return parseQuotedField(bytes, i + 1, limit, delByte);
        }
        return parseUnquotedField(bytes, startPos, i, limit, delByte);
    }

    /**
     * Parses the remainder of a quoted field; {@code contentStart} is the index right after the
     * opening quote.
     */
    private int parseQuotedField(byte[] bytes, int contentStart, int limit, byte delByte) {
        int i = contentStart;
        while (i < limit && bytes[i] != QUOTE_DOUBLE) {
            i++;
        }

        if (i >= limit) {
            // ran out of input before finding the closing quote
            setErrorState(ParseErrorState.UNTERMINATED_QUOTED_STRING);
            return -1;
        }

        this.result = new String(bytes, contentStart, i - contentStart, CHARSET);
        i++; // step over the closing quote

        // Only spaces and tabs are tolerated between the closing quote and the delimiter.
        while (i < limit && bytes[i] != delByte) {
            if (!isWhitespace(bytes[i])) {
                setErrorState(ParseErrorState.UNQUOTED_CHARS_AFTER_QUOTED_STRING);
                return -1;
            }
            i++;
        }
        return (i == limit) ? limit : i + 1;
    }

    /**
     * Parses an unquoted field. Scanning for the delimiter resumes at {@code scanPos}, while the
     * resulting string starts at {@code fieldStart} so that leading whitespace is preserved.
     */
    private int parseUnquotedField(byte[] bytes, int fieldStart, int scanPos, int limit, byte delByte) {
        int i = scanPos;
        while (i < limit && bytes[i] != delByte) {
            i++;
        }
        this.result = new String(bytes, fieldStart, i - fieldStart, CHARSET);
        return (i == limit) ? limit : i + 1;
    }

    /** Returns whether the byte is one of the two whitespace characters this parser knows: space or tab. */
    private static boolean isWhitespace(byte b) {
        return b == WHITESPACE_SPACE || b == WHITESPACE_TAB;
    }

    /**
     * Creates an initial value instance for this parser.
     *
     * @return the empty string
     */
    @Override
    public String createValue() {
        return "";
    }

    /**
     * Returns the string stored by the most recent {@link #parseField} call. The stored value is
     * not cleared when parsing fails, so it is only meaningful after a call that did not return
     * {@code -1}.
     *
     * @return the last parsed string, or {@code null} if nothing has been parsed yet
     */
    @Override
    public String getLastResult() {
        return this.result;
    }
}