/**
* Copyright 2011-2017 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.runtime.io;
import static com.asakusafw.runtime.io.TsvConstants.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.math.BigDecimal;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.StandardCharsets;
import java.text.MessageFormat;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.io.Text;
import com.asakusafw.runtime.value.BooleanOption;
import com.asakusafw.runtime.value.ByteOption;
import com.asakusafw.runtime.value.DateOption;
import com.asakusafw.runtime.value.DateTimeOption;
import com.asakusafw.runtime.value.DateUtil;
import com.asakusafw.runtime.value.DecimalOption;
import com.asakusafw.runtime.value.DoubleOption;
import com.asakusafw.runtime.value.FloatOption;
import com.asakusafw.runtime.value.IntOption;
import com.asakusafw.runtime.value.LongOption;
import com.asakusafw.runtime.value.ShortOption;
import com.asakusafw.runtime.value.StringOption;
import com.asakusafw.runtime.value.ValueOption;
/**
* Reads TSV format text and set each cell into {@link ValueOption}.
* Each method in this class may raise {@link NullPointerException} if parameters were {@code null}.
*/
@SuppressWarnings("deprecation")
public final class TsvParser implements RecordParser {
private static final Pattern SPECIAL_FLOAT =
Pattern.compile("(\\+?Inf.*)|(-Inf.*)|((\\+|-)?[Nn]a[Nn])"); //$NON-NLS-1$
private static final int SPECIAL_FLOAT_POSITIVE_INF = 1;
private static final int SPECIAL_FLOAT_NEGATIVE_INF = 2;
private static final Charset TEXT_ENCODE = StandardCharsets.UTF_8;
private static final int INITIAL_BUFFER_SIZE = 2048;
private final Reader reader;
private final CharsetEncoder encoder;
private int lastSeparator;
private int lookAhead;
private char[] charBuffer;
private CharBuffer wrappedCharBuffer;
private final ByteBuffer encodeBuffer;
/**
* Creates a new instance.
* @param reader the source reader
* @throws IOException if failed to initialize the parser
* @throws IllegalArgumentException if the parameter is {@code null}
*/
public TsvParser(Reader reader) throws IOException {
if (reader == null) {
throw new IllegalArgumentException("reader must not be null"); //$NON-NLS-1$
}
if (reader instanceof BufferedReader) {
this.reader = reader;
} else {
this.reader = new BufferedReader(reader);
}
this.encoder = TEXT_ENCODE
.newEncoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
this.charBuffer = new char[INITIAL_BUFFER_SIZE];
this.lastSeparator = RECORD_SEPARATOR;
this.encodeBuffer = ByteBuffer.allocate(INITIAL_BUFFER_SIZE);
fillLookAhead();
}
/**
* Consumes a character and put it into the look-ahead buffer.
* In the current implementation, the look-ahead buffer has always the head of the stream
* after each method invocation.
* @throws IOException if failed to read the next character
*/
private void fillLookAhead() throws IOException {
this.lookAhead = reader.read();
}
// MEMO: keep "throws IOException" for forward compatibility
@Override
public boolean next() throws RecordFormatException, IOException {
lastSeparator = CELL_SEPARATOR;
return lookAhead != -1;
}
/**
* Validates the current reading position is the head of a cell.
* @throws RecordFormatException if violated
*/
private void checkCellStart() throws RecordFormatException {
if (lastSeparator != CELL_SEPARATOR || lookAhead == -1) {
throw new RecordFormatException("Next cell is not started");
}
}
@Override
public void fill(BooleanOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
assertHasRest(option, lookAhead);
if (lookAhead == BOOLEAN_TRUE) {
option.modify(true);
} else if (lookAhead == BOOLEAN_FALSE) {
option.modify(false);
} else {
throw new RecordFormatException(MessageFormat.format(
"Invalid character {0} for boolean",
(char) lookAhead));
}
int next = reader.read();
if (isSeparator(next) == false) {
throw new RecordFormatException(MessageFormat.format(
"Invalid character {0} for boolean",
(char) next));
}
setLastSeparator(next);
fillLookAhead();
}
@Override
public void fill(ByteOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
option.modify((byte) readInt(option));
fillLookAhead();
}
@Override
public void fill(ShortOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
option.modify((short) readInt(option));
fillLookAhead();
}
@Override
public void fill(IntOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
int value = readInt(option);
option.modify(value);
fillLookAhead();
}
@Override
public void fill(LongOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
boolean negative = false;
if (lookAhead == '-') {
lookAhead = reader.read();
negative = true;
}
assertHasRest(option, lookAhead);
long value = toNumber(lookAhead);
while (true) {
int c = reader.read();
if (isSeparator(c)) {
setLastSeparator(c);
break;
}
value = value * 10L + toNumber(c);
}
if (negative) {
value = -value;
}
option.modify(value);
fillLookAhead();
}
@Override
public void fill(FloatOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
assertHasRest(option, lookAhead);
charBuffer[0] = (char) lookAhead;
int length = readString(1, option);
String string = new String(charBuffer, 0, length + 1);
try {
option.modify(Float.parseFloat(string));
} catch (NumberFormatException e) {
Matcher matcher = SPECIAL_FLOAT.matcher(string);
if (matcher.matches()) {
if (matcher.group(SPECIAL_FLOAT_POSITIVE_INF) != null) {
option.modify(Float.POSITIVE_INFINITY);
} else if (matcher.group(SPECIAL_FLOAT_NEGATIVE_INF) != null) {
option.modify(Float.NEGATIVE_INFINITY);
} else {
option.modify(Float.NaN);
}
} else {
throw new RecordFormatException(MessageFormat.format(
"Invalid character in floating-point context {0}",
string), e);
}
}
fillLookAhead();
}
@Override
public void fill(DoubleOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
assertHasRest(option, lookAhead);
charBuffer[0] = (char) lookAhead;
int length = readString(1, option);
String string = new String(charBuffer, 0, length + 1);
try {
option.modify(Double.parseDouble(string));
} catch (NumberFormatException e) {
Matcher matcher = SPECIAL_FLOAT.matcher(string);
if (matcher.matches()) {
if (matcher.group(SPECIAL_FLOAT_POSITIVE_INF) != null) {
option.modify(Double.POSITIVE_INFINITY);
} else if (matcher.group(SPECIAL_FLOAT_NEGATIVE_INF) != null) {
option.modify(Double.NEGATIVE_INFINITY);
} else {
option.modify(Double.NaN);
}
} else {
throw new RecordFormatException(MessageFormat.format(
"Invalid character in floating-point context {0}",
string), e);
}
}
fillLookAhead();
}
@Override
public void fill(DecimalOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
assertHasRest(option, lookAhead);
charBuffer[0] = (char) lookAhead;
int length = readString(1, option);
option.modify(new BigDecimal(charBuffer, 0, length + 1));
fillLookAhead();
}
@Override
public void fill(StringOption option) throws RecordFormatException, IOException {
checkCellStart();
// initializes charBuffer for writing
if (wrappedCharBuffer == null) {
wrappedCharBuffer = CharBuffer.wrap(charBuffer);
} else {
wrappedCharBuffer.clear();
}
// resets Text for writing
option.reset();
if (lookAhead == ESCAPE_CHAR) {
int c = reader.read();
if (c == ESCAPE_NULL_COLUMN) {
option.setNull();
int next = reader.read();
if (isSeparator(next) == false) {
throw new RecordFormatException(MessageFormat.format(
"Missing separator for {0}",
option.getClass().getSimpleName()));
}
setLastSeparator(next);
fillLookAhead();
return;
}
wrappedCharBuffer.append(unescape(c));
} else if (isSeparator(lookAhead)) {
setLastSeparator(lookAhead);
fillLookAhead();
return;
} else {
wrappedCharBuffer.append((char) lookAhead);
}
while (true) {
int c = reader.read();
if (isSeparator(c)) {
setLastSeparator(c);
break;
} else if (c == ESCAPE_CHAR) {
int trailing = reader.read();
wrappedCharBuffer.append(unescape(trailing));
} else {
wrappedCharBuffer.append((char) c);
}
// flush buffer before limit
if (wrappedCharBuffer.position() == wrappedCharBuffer.limit()) {
wrappedCharBuffer.flip();
append(wrappedCharBuffer, option);
wrappedCharBuffer.clear();
}
}
// flush rest contents in buffer
wrappedCharBuffer.flip();
append(wrappedCharBuffer, option);
wrappedCharBuffer.clear();
fillLookAhead();
}
@Override
public void fill(DateOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
int year = toNumber(lookAhead) * 1000 + readNumbers(YEAR_FIELD_LENGTH - 1, option);
consume(DATE_FIELD_SEPARATOR);
int month = readNumbers(MONTH_FIELD_LENGTH, option);
consume(DATE_FIELD_SEPARATOR);
int day = readNumbers(DATE_FIELD_LENGTH, option);
int last = reader.read();
if (isSeparator(last) == false) {
throw new RecordFormatException(MessageFormat.format(
"Missing separator for {0}",
option.getClass().getSimpleName()));
}
setLastSeparator(last);
if (year == 0 || month == 0 || day == 0) {
option.setNull();
} else {
option.modify(DateUtil.getDayFromDate(year, month, day));
}
fillLookAhead();
}
@Override
public void fill(DateTimeOption option) throws RecordFormatException, IOException {
checkCellStart();
if (applyNull(option)) {
return;
}
int year = toNumber(lookAhead) * 1000 + readNumbers(YEAR_FIELD_LENGTH - 1, option);
consume(DATE_FIELD_SEPARATOR);
int month = readNumbers(MONTH_FIELD_LENGTH, option);
consume(DATE_FIELD_SEPARATOR);
int day = readNumbers(DATE_FIELD_LENGTH, option);
consume(DATE_TIME_SEPARATOR);
int hour = readNumbers(HOUR_FIELD_LENGTH, option);
consume(TIME_FIELD_SEPARATOR);
int minute = readNumbers(MINUTE_FIELD_LENGTH, option);
consume(TIME_FIELD_SEPARATOR);
int second = readNumbers(SECOND_FIELD_LENGTH, option);
int last = reader.read();
if (isSeparator(last) == false) {
throw new RecordFormatException(MessageFormat.format(
"Missing separator for {0}",
option.getClass().getSimpleName()));
}
setLastSeparator(last);
if (year == 0 || month == 0 || day == 0) {
option.setNull();
} else {
long result = DateUtil.getDayFromDate(year, month, day);
result *= 24L * 60L * 60L;
result += DateUtil.getSecondFromTime(hour, minute, second);
option.modify(result);
}
fillLookAhead();
}
private int readNumbers(int columns, ValueOption<?> option) throws IOException {
int total = 0;
for (int i = 0; i < columns; i++) {
int c = reader.read();
total = total * 10 + toNumber(c);
}
return total;
}
private void consume(char expect) throws IOException {
int c = reader.read();
if (c != expect) {
throw new RecordFormatException(MessageFormat.format(
"Invalid character in expected ''{0}'' but was \"{1}\"",
expect,
String.format("\\u%04x", c))); //$NON-NLS-1$
}
}
private int toNumber(int c) throws RecordFormatException {
if ('0' <= c && c <= '9') {
return c - '0';
}
throw new RecordFormatException(MessageFormat.format(
"Invalid character in number context {0}",
String.format("\\u%04x", c))); //$NON-NLS-1$
}
private void append(CharBuffer source, StringOption target) throws RecordFormatException {
if (source.hasRemaining() == false) {
return;
}
Text text = target.get();
encoder.reset();
encodeBuffer.clear();
while (true) {
CoderResult result = encoder.encode(source, encodeBuffer, true);
if (result.isError()) {
throw new RecordFormatException(MessageFormat.format(
"Cannot process a character string (\"{0}\")",
result));
}
if (result.isUnderflow()) {
consumeEncoded(text);
break;
}
if (result.isOverflow()) {
consumeEncoded(text);
}
}
while (true) {
CoderResult result = encoder.flush(encodeBuffer);
if (result.isError()) {
throw new RecordFormatException(MessageFormat.format(
"Cannot process a character string (\"{0}\")",
result));
}
if (result.isUnderflow()) {
consumeEncoded(text);
break;
}
if (result.isOverflow()) {
consumeEncoded(text);
}
}
}
private void consumeEncoded(Text text) {
encodeBuffer.flip();
if (encodeBuffer.hasRemaining()) {
text.append(
encodeBuffer.array(),
encodeBuffer.position(),
encodeBuffer.limit());
}
encodeBuffer.clear();
}
private char unescape(int c) throws RecordFormatException {
if (c == ESCAPE_CHAR) {
return ESCAPE_CHAR;
}
if (c == ESCAPE_HT) {
return '\t';
}
if (c == ESCAPE_LF) {
return '\n';
}
throw new RecordFormatException(MessageFormat.format(
"Unknown escape character \\{0} ({1}) for StringOption",
(char) c,
String.format("U%04x", c))); //$NON-NLS-1$
}
/**
* Consumes the next integer.
* @param option the target object (this method never changes it)
* @return the next integer
* @throws RecordFormatException if the format is wrong
* @throws IOException if failed by I/O error
*/
private int readInt(ValueOption<?> option) throws IOException, RecordFormatException {
boolean negative = false;
if (lookAhead == '-') {
lookAhead = reader.read();
negative = true;
}
assertHasRest(option, lookAhead);
int value = toNumber(lookAhead);
while (true) {
int c = reader.read();
if (isSeparator(c)) {
setLastSeparator(c);
break;
}
value = value * 10 + toNumber(c);
}
if (negative) {
value = -value;
}
return value;
}
private void setLastSeparator(int c) {
lastSeparator = c;
}
/**
* Consumes the next string and put it into {@link #charBuffer}.
* @param start the starting offset index of {@link #charBuffer}
* @param option the target object (this method never changes it)
* @return the number of characters to be consumed
* @throws IOException if failed by I/O error
*/
private int readString(int start, ValueOption<?> option) throws IOException {
int current = start;
while (true) {
char[] cbuf = charBuffer;
for (int i = current, n = cbuf.length; i < n; i++) {
int c = reader.read();
if (isSeparator(c)) {
setLastSeparator(c);
return i - start;
}
cbuf[i] = (char) c;
}
current = cbuf.length;
expandCharBuffer();
}
}
private void expandCharBuffer() {
char[] newBuffer = new char[charBuffer.length * 2];
System.arraycopy(charBuffer, 0, newBuffer, 0, charBuffer.length);
charBuffer = newBuffer;
wrappedCharBuffer = null;
}
private static boolean isSeparator(int c) {
return c == -1
|| c == CELL_SEPARATOR
|| c == RECORD_SEPARATOR;
}
private void assertHasRest(ValueOption<?> option, int c) throws RecordFormatException {
if (isSeparator(c)) {
throw new RecordFormatException(MessageFormat.format(
"Empty value for {0}",
option.getClass().getSimpleName()));
}
}
/**
* Consumes {@code null} to the target object only if the next cell actually represents {@code null}.
* @param option the target object
* @return {@code true} if actually consumed the next cell, or otherwise {@code false}
* @throws RecordFormatException if the record format is something wrong
* @throws IOException if failed by I/O error
*/
private boolean applyNull(ValueOption<?> option) throws RecordFormatException, IOException {
if (lookAhead != ESCAPE_CHAR) {
return false;
}
int c = reader.read();
if (c == ESCAPE_NULL_COLUMN) {
option.setNull();
int next = reader.read();
if (isSeparator(next) == false) {
throw new RecordFormatException(MessageFormat.format(
"Missing separator for {0}",
option.getClass().getSimpleName()));
}
setLastSeparator(next);
fillLookAhead();
return true;
} else {
throw new RecordFormatException(MessageFormat.format(
"Cannot recognize \"{1}\" for {0}",
option.getClass().getSimpleName(),
new StringBuilder().append(ESCAPE_CHAR).append(ESCAPE_NULL_COLUMN)));
}
}
@Override
public void endRecord() throws RecordFormatException, IOException {
if (lastSeparator != RECORD_SEPARATOR) {
throw new RecordFormatException("RECORD_SEPARATOR does not appeared");
}
}
@Override
public void close() throws IOException {
reader.close();
}
}