/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flume.serialization;
import com.google.common.base.Charsets;
import org.apache.flume.annotations.InterfaceAudience;
import org.apache.flume.annotations.InterfaceStability;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
/**
* <p>This class makes the following assumptions:</p>
* <ol>
* <li>The underlying file is not changing while it is being read</li>
* </ol>
*
* <p>The ability to {@link #reset()} is dependent on the underlying {@link
* PositionTracker} instance's durability semantics.</p>
*
* <p><strong>A note on surrogate pairs:</strong></p>
*
* <p>The logic for decoding surrogate pairs is as follows:
* If no character has been decoded by a "normal" pass, and the buffer still has remaining bytes,
* then an attempt is made to read 2 characters in one pass.
* If it succeeds, then the first char (high surrogate) is returned;
* the second char (low surrogate) is recorded internally,
* and is returned at the next call to {@link #readChar()}.
* If it fails, then it is assumed that EOF has been reached.</p>
*
* <p>Impacts on position, mark and reset: when a surrogate pair is decoded, the position
* is incremented by the amount of bytes taken to decode the <em>entire</em> pair (usually, 4).
* This is the most reasonable choice since it would not be advisable
* to reset a stream to a position pointing to the second char in a pair of surrogates:
* such a dangling surrogate would not be properly decoded without its counterpart.</p>
*
* <p>Thus the behaviour of mark and reset is as follows:</p>
*
* <ol>
* <li>If {@link #mark()} is called after the high surrogate of a pair has been returned by
* {@link #readChar()}, the marked position will be that of the character <em>following</em>
* the low surrogate, <em>not</em> that of the low surrogate itself.</li>
* <li>If {@link #reset()} is called after the high surrogate of a pair has been returned by
* {@link #readChar()}, the low surrogate is always returned by the next call to
* {@link #readChar()}, <em>before</em> any character is read from the last marked
* position.</li>
* </ol>
*
* <p>This ensures that no dangling high surrogate could ever be read as long as
* the same instance is used to read the whole pair. <strong>However, if {@link #reset()}
* is called after the high surrogate of a pair has been returned by {@link #readChar()},
* and a new instance of ResettableFileInputStream is used to resume reading,
* then the low surrogate char will be lost,
* resulting in a corrupted sequence of characters (dangling high surrogate).</strong>
* This situation is hopefully extremely unlikely to happen in real life.
* </p>
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
public class ResettableFileInputStream extends ResettableInputStream
    implements RemoteMarkable, LengthMeasurable {

  /** Class-wide logger; shared by all instances instead of being re-created per stream. */
  private static final Logger logger = LoggerFactory.getLogger(ResettableFileInputStream.class);

  /** Buffer size, in bytes, used by the two-argument constructor. */
  public static final int DEFAULT_BUF_SIZE = 16384;

  /**
   * The minimum acceptable buffer size to store bytes read
   * from the underlying file. A minimum size of 8 ensures that the
   * buffer always has enough space to contain multi-byte characters,
   * including special sequences such as surrogate pairs, Byte Order Marks, etc.
   */
  public static final int MIN_BUF_SIZE = 8;

  /** The file being read. */
  private final File file;

  /** Stores and retrieves the marked position. */
  private final PositionTracker tracker;

  /** Stream opened on {@link #file}; owns the underlying file descriptor. */
  private final FileInputStream in;

  /** Channel of {@link #in}, used for positioned reads into {@link #buf}. */
  private final FileChannel chan;

  /** Buffer of raw bytes read from the file and not yet consumed. */
  private final ByteBuffer buf;

  /** Decoding target; holds up to two chars so a surrogate pair can be decoded in one pass. */
  private final CharBuffer charBuf;

  /** One-byte scratch array backing {@link #read()}. */
  private final byte[] byteBuf;

  /** Length of the file as measured once, at construction time. */
  private final long fileSize;

  /** Decoder for the configured charset, configured with the requested error action. */
  private final CharsetDecoder decoder;

  /** Minimum number of buffered bytes {@link #readChar()} wants before decoding. */
  private final int maxCharWidth;

  /** File offset of the next byte to be consumed from {@link #buf}. */
  private long position;

  /**
   * Position reported by {@link #tell()}. It follows {@link #position} whenever a read
   * returns data or a seek happens, but not when {@link #readChar()} reports end of file.
   */
  private long syncPosition;

  /**
   * Whether this instance holds a low surrogate character.
   */
  private boolean hasLowSurrogate = false;

  /**
   * A low surrogate character read from a surrogate pair.
   * When a surrogate pair is found, the high (first) surrogate
   * is returned upon a call to {@link #readChar()},
   * while the low (second) surrogate remains stored in memory,
   * to be returned at the next call to {@link #readChar()}.
   */
  private char lowSurrogate;

  /**
   * Creates a stream with a buffer of {@link #DEFAULT_BUF_SIZE} bytes that decodes
   * UTF-8 and fails on malformed or unmappable input.
   *
   * @param file
   *        File to read
   *
   * @param tracker
   *        PositionTracker implementation to make offset position durable
   *
   * @throws FileNotFoundException If the file to read does not exist
   * @throws IOException If the position reported by the tracker cannot be sought
   */
  public ResettableFileInputStream(File file, PositionTracker tracker)
      throws IOException {
    this(file, tracker, DEFAULT_BUF_SIZE, Charsets.UTF_8, DecodeErrorPolicy.FAIL);
  }

  /**
   * Opens the file and positions the stream at the offset reported by the tracker.
   * If anything fails after the file has been opened, the file is closed again
   * before the failure is propagated.
   *
   * @param file
   *        File to read
   *
   * @param tracker
   *        PositionTracker implementation to make offset position durable
   *
   * @param bufSize
   *        Size of the underlying buffer used for input. If less than {@link #MIN_BUF_SIZE},
   *        a buffer of length {@link #MIN_BUF_SIZE} will be created instead.
   *
   * @param charset
   *        Character set used for decoding text, as necessary
   *
   * @param decodeErrorPolicy
   *        A {@link DecodeErrorPolicy} instance to determine how
   *        the decoder should behave in case of malformed input and/or
   *        unmappable character.
   *
   * @throws FileNotFoundException If the file to read does not exist
   * @throws IOException If the position reported by the tracker cannot be sought
   * @throws IllegalArgumentException If the decode error policy is not a known value
   */
  public ResettableFileInputStream(File file, PositionTracker tracker,
      int bufSize, Charset charset, DecodeErrorPolicy decodeErrorPolicy)
      throws IOException {
    this.file = file;
    this.tracker = tracker;
    this.in = new FileInputStream(file);

    // From here on the file is open: make sure it does not leak if setup fails.
    boolean initialized = false;
    try {
      this.chan = in.getChannel();
      this.buf = ByteBuffer.allocateDirect(Math.max(bufSize, MIN_BUF_SIZE));
      buf.flip(); // start with nothing to read; the first read triggers a refill
      this.byteBuf = new byte[1]; // single byte
      this.charBuf = CharBuffer.allocate(2); // two chars for surrogate pairs
      charBuf.flip();
      this.fileSize = file.length();
      this.decoder = charset.newDecoder();
      this.position = 0;
      this.syncPosition = 0;
      this.maxCharWidth = computeMaxCharWidth(charset);

      CodingErrorAction errorAction = toErrorAction(decodeErrorPolicy);
      decoder.onMalformedInput(errorAction);
      decoder.onUnmappableCharacter(errorAction);

      seek(tracker.getPosition());
      initialized = true;
    } finally {
      if (!initialized) {
        try {
          in.close();
        } catch (IOException closeFailure) {
          // Do not mask the original failure; just record that cleanup failed too.
          logger.warn("Failed to close file after initialization error: " + file, closeFailure);
        }
      }
    }
  }

  /**
   * Returns the number of bytes that {@link #readChar()} wants buffered before decoding.
   */
  private static int computeMaxCharWidth(Charset charset) {
    String name = charset.name();
    if (name.startsWith("UTF-8")) {
      // some JDKs wrongly report 3 bytes max
      return 4;
    }
    if (name.startsWith("UTF-16")) {
      // UTF_16BE and UTF_16LE wrongly report 2 bytes max
      return 4;
    }
    if (name.startsWith("UTF-32")) {
      // UTF_32BE and UTF_32LE wrongly report 4 bytes max
      return 8;
    }
    return (int) Math.ceil(charset.newEncoder().maxBytesPerChar());
  }

  /**
   * Maps a decode error policy onto the matching decoder error action.
   */
  private static CodingErrorAction toErrorAction(DecodeErrorPolicy decodeErrorPolicy) {
    switch (decodeErrorPolicy) {
      case FAIL:
        return CodingErrorAction.REPORT;
      case REPLACE:
        return CodingErrorAction.REPLACE;
      case IGNORE:
        return CodingErrorAction.IGNORE;
      default:
        throw new IllegalArgumentException(
            "Unexpected value for decode error policy: " + decodeErrorPolicy);
    }
  }

  /**
   * Throws the exception matching the given result if it denotes malformed
   * or unmappable input; does nothing otherwise.
   */
  private static void throwIfDecodeError(CoderResult result) throws IOException {
    if (result.isMalformed() || result.isUnmappable()) {
      result.throwException();
    }
  }

  /**
   * Reads a single byte.
   *
   * @return the byte as a value between 0 and 255, or -1 if no byte could be read
   * @throws IOException if reading from the file fails
   */
  @Override
  public synchronized int read() throws IOException {
    int len = read(byteBuf, 0, 1);
    // -1 means end of file; 0 should never happen, but is treated the same way
    if (len == -1 || len == 0) {
      return -1;
    }
    return byteBuf[0] & 0xFF;
  }

  /**
   * Reads up to {@code len} bytes into {@code b}, starting at {@code off}.
   * At most the bytes currently available in the internal buffer are returned,
   * so fewer than {@code len} bytes may be read.
   *
   * @param b destination array
   * @param off offset in {@code b} at which to start storing bytes
   * @param len maximum number of bytes to read
   * @return the number of bytes read, or -1 if the position has reached the file size
   *         measured at construction time
   * @throws IOException if reading from the file fails
   */
  @Override
  public synchronized int read(byte[] b, int off, int len) throws IOException {
    logger.trace("read(buf, {}, {})", off, len);

    if (position >= fileSize) {
      return -1;
    }

    if (!buf.hasRemaining()) {
      refillBuf();
    }

    // never hand out more than what is currently buffered
    int rem = buf.remaining();
    if (len > rem) {
      len = rem;
    }
    buf.get(b, off, len);
    incrPosition(len, true);
    return len;
  }

  /**
   * Decodes and returns the next char. For a surrogate pair, the high surrogate is
   * returned first and the low surrogate is returned by the following call; the
   * position is advanced past the whole pair as soon as the high surrogate is returned.
   *
   * @return the decoded char, or -1 if no char could be decoded
   * @throws IOException if reading fails, or if the decoder reports malformed or
   *         unmappable input
   */
  @Override
  public synchronized int readChar() throws IOException {
    // Check whether we are in the middle of a surrogate pair,
    // in which case, return the last (low surrogate) char of the pair.
    if (hasLowSurrogate) {
      hasLowSurrogate = false;
      return lowSurrogate;
    }

    // The decoder can have issues with multi-byte characters.
    // This check ensures that there are at least maxCharWidth bytes in the buffer
    // before reaching EOF.
    if (buf.remaining() < maxCharWidth) {
      // drop what is left; refillBuf() reads again from the current file position
      buf.clear();
      buf.flip();
      refillBuf();
    }

    int start = buf.position();
    charBuf.clear();
    charBuf.limit(1);

    boolean isEndOfInput = position >= fileSize;

    CoderResult res = decoder.decode(buf, charBuf, isEndOfInput);
    throwIfDecodeError(res);

    int delta = buf.position() - start;
    charBuf.flip();

    // Found a single char
    if (charBuf.hasRemaining()) {
      char c = charBuf.get();
      incrPosition(delta, true);
      return c;
    }

    // Found nothing, but the byte buffer has not been entirely consumed.
    // This situation denotes the presence of a surrogate pair
    // that can only be decoded if we have a 2-char buffer.
    if (buf.hasRemaining()) {
      charBuf.clear();
      // increase the limit to 2
      charBuf.limit(2);
      // decode 2 chars in one pass
      res = decoder.decode(buf, charBuf, isEndOfInput);
      throwIfDecodeError(res);
      charBuf.flip();

      // Check if we successfully decoded 2 chars
      if (charBuf.remaining() == 2) {
        char highSurrogate = charBuf.get();
        // save second (low surrogate) char for later consumption
        lowSurrogate = charBuf.get();
        // Check if we really have a surrogate pair
        if (!Character.isHighSurrogate(highSurrogate) || !Character.isLowSurrogate(lowSurrogate)) {
          // This should only happen in case of bad sequences (dangling surrogate, etc.)
          logger.warn("Decoded a pair of chars, but it does not seem to be a surrogate pair: {} {}",
              (int) highSurrogate, (int) lowSurrogate);
        }
        hasLowSurrogate = true;
        // consider the pair as a single unit and increment position normally
        delta = buf.position() - start;
        incrPosition(delta, true);
        // return the first (high surrogate) char of the pair
        return highSurrogate;
      }
    }

    // end of file
    incrPosition(delta, false);
    return -1;
  }

  /**
   * Reads more bytes from the file, starting at the current position, into the buffer
   * and leaves the buffer ready to be read.
   */
  private void refillBuf() throws IOException {
    buf.compact();
    chan.position(position); // ensure we read from the proper offset
    chan.read(buf);
    buf.flip();
  }

  /**
   * Stores the position reported by {@link #tell()} in the tracker.
   *
   * @throws IOException if the tracker cannot store the position
   */
  @Override
  public void mark() throws IOException {
    tracker.storePosition(tell());
  }

  /**
   * Stores an arbitrary position in the tracker, without moving this stream.
   *
   * @param position the position to store
   * @throws IOException if the tracker cannot store the position
   */
  @Override
  public void markPosition(long position) throws IOException {
    tracker.storePosition(position);
  }

  /**
   * Returns the position currently held by the tracker.
   *
   * @throws IOException declared by the interface; not thrown by this implementation itself
   */
  @Override
  public long getMarkPosition() throws IOException {
    return tracker.getPosition();
  }

  /**
   * Seeks back to the position currently held by the tracker.
   *
   * @throws IOException if seeking fails
   */
  @Override
  public void reset() throws IOException {
    seek(tracker.getPosition());
  }

  /**
   * Returns the current length of the file, as reported by the file system at call time.
   *
   * @throws IOException declared by the interface; not thrown by this implementation itself
   */
  @Override
  public long length() throws IOException {
    return file.length();
  }

  /**
   * Returns the position reached by the last read that returned data, or by the last seek.
   *
   * @throws IOException declared by the interface; not thrown by this implementation itself
   */
  @Override
  public long tell() throws IOException {
    logger.trace("Tell position: {}", syncPosition);
    return syncPosition;
  }

  /**
   * Moves the stream to the given file offset. Seeking to the current position is a no-op.
   * A low surrogate that is pending from a previous {@link #readChar()} is not discarded.
   *
   * @param newPos the file offset to move to
   * @throws IOException if the underlying channel cannot be positioned
   */
  @Override
  public synchronized void seek(long newPos) throws IOException {
    logger.trace("Seek to position: {}", newPos);

    // check to see if we can seek within our existing buffer
    long relativeChange = newPos - position;
    if (relativeChange == 0) {
      return; // seek to current pos => no-op
    }

    long newBufPos = buf.position() + relativeChange;
    if (newBufPos >= 0 && newBufPos < buf.limit()) {
      // we can reuse the read buffer
      buf.position((int) newBufPos);
    } else {
      // otherwise, we have to invalidate the read buffer
      buf.clear();
      buf.flip();
    }

    // clear decoder state
    decoder.reset();

    // perform underlying file seek
    chan.position(newPos);

    // reset position pointers
    position = syncPosition = newPos;
  }

  /**
   * Advances the read position by {@code incr} bytes.
   *
   * @param incr number of bytes consumed
   * @param updateSyncPosition whether the position reported by {@link #tell()}
   *        should catch up with the read position
   */
  private void incrPosition(int incr, boolean updateSyncPosition) {
    position += incr;
    if (updateSyncPosition) {
      syncPosition = position;
    }
  }

  /**
   * Closes the tracker and the underlying file. The file is closed even if
   * closing the tracker fails.
   *
   * @throws IOException if closing the tracker or the file fails
   */
  @Override
  public void close() throws IOException {
    try {
      tracker.close();
    } finally {
      in.close();
    }
  }
}