/**
* Copyright 2011-2017 Asakusa Framework Team.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.asakusafw.runtime.io.line;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.text.MessageFormat;
import org.apache.hadoop.io.Text;
import com.asakusafw.runtime.value.StringOption;
/**
* A simple line reader for text with any charset encoding.
* @since 0.7.5
*/
public class BasicLineInput extends LineInput {

    /** Shared empty text used only as the copy source when initializing a {@code null} model. */
    private static final Text EMPTY = new Text();

    /** Lower bound (in chars) applied to the configured buffer size. */
    private static final int MIN_BUFFER_SIZE = 256;

    /** Decodes the source stream using the configured charset. */
    private final Reader reader;

    /** The source path; used by {@link #getPath()} and in error messages. */
    private final String path;

    /** Re-encodes decoded characters into the bytes appended to the destination {@link Text}. */
    private final CharsetEncoder encoder;

    /** Decoded characters that have not been consumed yet (kept in "read" mode between calls). */
    private final CharBuffer charBuffer;

    /** Scratch buffer for encoded bytes; drained into the destination on every encode pass. */
    private final ByteBuffer byteBuffer;

    /** {@code true} iff the last consumed character was a CR line terminator. */
    private boolean sawCr;

    /** {@code true} once the underlying reader has reported end of input. */
    private boolean sawEof;

    /** Number of lines returned so far. */
    private long lineNumber;

    /**
     * Creates a new instance.
     * @param stream the source stream
     * @param path the source path (stored as-is; it is not validated here)
     * @param configuration the current configuration
     * @throws IllegalArgumentException if {@code stream} or {@code configuration} was {@code null}
     * @see #newInstance(InputStream, String, LineConfiguration)
     */
    public BasicLineInput(InputStream stream, String path, LineConfiguration configuration) {
        if (stream == null) {
            throw new IllegalArgumentException("stream must not be null"); //$NON-NLS-1$
        }
        if (configuration == null) {
            throw new IllegalArgumentException("configuration must not be null"); //$NON-NLS-1$
        }
        this.reader = new InputStreamReader(stream, configuration.getCharset());
        this.path = path;
        // INTERNAL_CHARSET is not declared in this file - presumably inherited from LineInput
        this.encoder = INTERNAL_CHARSET.newEncoder()
                .onMalformedInput(configuration.getMalformedInputAction())
                .onUnmappableCharacter(configuration.getUnmappableCharacterAction());
        int bufferSize = Math.max(MIN_BUFFER_SIZE, configuration.getBufferSize());
        this.charBuffer = CharBuffer.wrap(new char[bufferSize]);
        // clear + flip leaves position == limit == 0, i.e. "nothing to read yet"
        this.charBuffer.clear();
        this.charBuffer.flip();
        // the byte buffer may be smaller than one line: append() drains it repeatedly on overflow
        this.byteBuffer = ByteBuffer.wrap(new byte[bufferSize / 2]);
        this.sawCr = false;
        this.sawEof = false;
        this.lineNumber = 0;
    }

    @Override
    public String getPath() {
        return path;
    }

    /**
     * Returns the number of the line most recently returned by {@link #readTo(StringOption)}.
     * @return the number of lines returned so far, or {@code -1} if no line has been returned yet
     */
    @Override
    public long getLineNumber() {
        if (lineNumber <= 0) {
            return -1;
        }
        return lineNumber;
    }

    /**
     * Reads the next line into the given model, excluding its line terminator.
     * CR, LF, and CR followed by LF are each handled as one line terminator.
     * Trailing characters without a terminator at the end of input are returned as the last line.
     * Note that the model contents may be cleared even when this returns {@code false}.
     * @param model the destination object
     * @return {@code true} if a line was read, or {@code false} if there are no more lines
     * @throws IOException if reading the source or encoding its characters failed
     */
    @Override
    public boolean readTo(StringOption model) throws IOException {
        if (sawEof) {
            return false;
        }
        boolean changed = false;
        Text entity = null;
        while (true) {
            if (prepare() == false) {
                // end of input: characters collected so far form a final, unterminated line
                if (changed) {
                    lineNumber++;
                    return true;
                }
                return false;
            }
            // reset the destination lazily, only once there is something to scan
            if (entity == null) {
                entity = reset(model);
            }
            assert entity != null;
            State state = appendBufferTo(entity);
            switch (state) {
            case NOTHING:
                continue;
            case CONTINUE:
                changed = true;
                continue;
            case LINE_BREAK:
                lineNumber++;
                return true;
            default:
                throw new AssertionError();
            }
        }
    }

    /**
     * Ensures that the character buffer has at least one unread character.
     * @return {@code true} if characters are available, or {@code false} on end of input
     * @throws IOException if reading from the source failed
     */
    private boolean prepare() throws IOException {
        CharBuffer b = charBuffer;
        if (b.hasRemaining()) {
            // already prepared
            return true;
        }
        b.clear();
        while (true) {
            int read = reader.read(b);
            if (read < 0) {
                sawEof = true;
                return false;
            } else if (read > 0) {
                b.flip();
                break;
            }
            // read == 0: nothing was transferred, try again
        }
        return true;
    }

    /**
     * Makes the model hold an empty (non-null) text and returns that text.
     * @param model the model to reset
     * @return the text entity obtained from the model
     */
    @SuppressWarnings("deprecation")
    private Text reset(StringOption model) {
        if (model.isNull()) {
            model.modify(EMPTY);
        } else {
            model.get().clear();
        }
        return model.get();
    }

    /**
     * Consumes characters from the buffer up to and including the next line terminator,
     * or up to the buffer limit, and appends the non-terminator characters to the destination.
     * @param entity the destination text
     * @return {@link State#LINE_BREAK} if a terminator was consumed,
     *     {@link State#CONTINUE} if characters were appended but no terminator was found,
     *     or {@link State#NOTHING} if nothing was appended and no terminator was found
     * @throws IOException if encoding the characters failed
     */
    private State appendBufferTo(Text entity) throws IOException {
        CharBuffer b = charBuffer;
        assert b.hasRemaining();
        char[] cs = charBuffer.array();

        // The previous terminator was CR: an LF right after it belongs to the same line break.
        // The flag must be consumed here, exactly once, on the first character after the CR.
        // If it stayed set while a long line spans several buffers, a real LF terminator
        // at the head of a later buffer would be dropped and two lines would be merged.
        if (sawCr) {
            sawCr = false;
            if (cs[b.position()] == '\n') {
                b.position(b.position() + 1);
            }
        }

        boolean eol = false;
        int len = 0;
        // scan buffer until CR/LF/buffer limit
        for (int i = b.position(), n = b.limit(); i < n; i++) {
            char c = cs[i];
            if (c == '\r' || c == '\n') {
                eol = true;
                sawCr = c == '\r';
                break;
            }
            len++;
        }
        if (len > 0) {
            // advances the buffer position by len
            append(entity, len);
        }
        // consume the line terminator itself
        if (eol) {
            b.position(b.position() + 1);
            return State.LINE_BREAK;
        }
        return len == 0 ? State.NOTHING : State.CONTINUE;
    }

    /**
     * Encodes the next {@code len} characters of the buffer and appends the bytes to the destination.
     * @param entity the destination text
     * @param len the number of characters to consume from the current buffer position
     * @throws IOException if the encoder reported a malformed or unmappable character
     */
    private void append(Text entity, int len) throws IOException {
        ByteBuffer bs = byteBuffer;
        CharBuffer cs = charBuffer;
        int limit = cs.limit();
        // slice the buffer
        cs.limit(cs.position() + len);
        while (true) {
            bs.clear();
            CoderResult result = encoder.encode(cs, bs, true);
            if (result.isError() == false) {
                bs.flip();
                // position is 0 after flip(), so remaining() is the number of encoded bytes
                entity.append(bs.array(), bs.position(), bs.remaining());
                if (result.isUnderflow()) {
                    // all sliced characters were consumed
                    break;
                }
                // overflow: the byte buffer was full, drain and encode the rest
            } else {
                assert result.isError();
                try {
                    result.throwException();
                } catch (CharacterCodingException e) {
                    throw new IOException(MessageFormat.format(
                            "exception occurred while encoding text: {0}",
                            path), e);
                }
            }
        }
        // restore the original limit so the rest of the buffer becomes visible again
        cs.limit(limit);
    }

    @Override
    public void close() throws IOException {
        reader.close();
    }

    /** The result of scanning the current buffer contents. */
    private enum State {

        /** Nothing was appended and no line terminator was found. */
        NOTHING,

        /** Characters were appended, but the line continues in the next buffer. */
        CONTINUE,

        /** A line terminator was consumed. */
        LINE_BREAK,
    }
}