/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.common.io;

import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.parser.FieldParser;
import org.apache.flink.util.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;

/**
 * Base implementation for input formats that split the input at a delimiter into records.
 * The parsing of the record bytes into the record has to be implemented in the
 * {@link #readRecord(Object, byte[], int, int)} method.
 *
 * <p>The default delimiter is the newline character {@code '\n'}.</p>
 */
@Public
public abstract class DelimitedInputFormat<OT> extends FileInputFormat<OT> implements CheckpointableInputFormat<FileInputSplit, Long> {

	private static final long serialVersionUID = 1L;

	// -------------------------------------- Constants -------------------------------------------

	/**
	 * The log.
	 */
	private static final Logger LOG = LoggerFactory.getLogger(DelimitedInputFormat.class);

	// The charset used to convert strings to bytes
	private String charsetName = "UTF-8";

	// Charset is not serializable
	private transient Charset charset;

	/**
	 * The default read buffer size = 1MB.
	 */
	private static final int DEFAULT_READ_BUFFER_SIZE = 1024 * 1024;

	/**
	 * Indication that the number of samples has not been set by the configuration.
	 */
	private static final int NUM_SAMPLES_UNDEFINED = -1;

	/**
	 * The maximum number of line samples to be taken.
	 */
	private static int DEFAULT_MAX_NUM_SAMPLES;

	/**
	 * The minimum number of line samples to be taken.
	 */
	private static int DEFAULT_MIN_NUM_SAMPLES;

	/**
	 * The maximum size of a sample record before sampling is aborted, intended to catch cases
	 * where a wrong delimiter is given.
	 */
	private static int MAX_SAMPLE_LEN;

	/**
	 * @deprecated Please use {@code loadConfigParameters(Configuration config)}
	 */
	@Deprecated
	protected static void loadGlobalConfigParams() {
		loadConfigParameters(GlobalConfiguration.loadConfiguration());
	}

	protected static void loadConfigParameters(Configuration parameters) {
		int maxSamples = parameters.getInteger(ConfigConstants.DELIMITED_FORMAT_MAX_LINE_SAMPLES_KEY,
				ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES);
		int minSamples = parameters.getInteger(ConfigConstants.DELIMITED_FORMAT_MIN_LINE_SAMPLES_KEY,
				ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES);

		if (maxSamples < 0) {
			LOG.error("Invalid default maximum number of line samples: " + maxSamples + ". Using default value of " +
					ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES);
			maxSamples = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_LINE_SAMPLES;
		}
		if (minSamples < 0) {
			LOG.error("Invalid default minimum number of line samples: " + minSamples + ". Using default value of " +
					ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES);
			minSamples = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MIN_LINE_SAMPLES;
		}

		DEFAULT_MAX_NUM_SAMPLES = maxSamples;

		if (minSamples > maxSamples) {
			LOG.error("Default minimum number of line samples cannot be greater than the default maximum number " +
					"of line samples: min=" + minSamples + ", max=" + maxSamples + ". Defaulting minimum to maximum.");
			DEFAULT_MIN_NUM_SAMPLES = maxSamples;
		} else {
			DEFAULT_MIN_NUM_SAMPLES = minSamples;
		}

		int maxLen = parameters.getInteger(ConfigConstants.DELIMITED_FORMAT_MAX_SAMPLE_LENGTH_KEY,
				ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN);
		if (maxLen <= 0) {
			maxLen = ConfigConstants.DEFAULT_DELIMITED_FORMAT_MAX_SAMPLE_LEN;
			LOG.error("Invalid value for the maximum sample record length. Using default value of " + maxLen + '.');
		} else if (maxLen < DEFAULT_READ_BUFFER_SIZE) {
			maxLen = DEFAULT_READ_BUFFER_SIZE;
			LOG.warn("Increasing maximum sample record length to size of the read buffer (" + maxLen + ").");
		}
		MAX_SAMPLE_LEN = maxLen;
	}

	// --------------------------------------------------------------------------------------------
	//  Variables for internal parsing.
	//  They are all transient, because we do not want them to be serialized.
	// --------------------------------------------------------------------------------------------

	private transient byte[] readBuffer;

	private transient byte[] wrapBuffer;

	private transient int readPos;

	private transient int limit;

	private transient byte[] currBuffer;	// buffer in which current record byte sequence is found
	private transient int currOffset;		// offset in above buffer
	private transient int currLen;			// length of current byte sequence

	private transient boolean overLimit;

	private transient boolean end;

	private long offset = -1;

	// --------------------------------------------------------------------------------------------
	//  The configuration parameters. Configured on the instance and serialized to be shipped.
	// --------------------------------------------------------------------------------------------

	// The delimiter may be set with a byte-sequence or a String. In the latter
	// case the byte representation is kept consistent with the current charset.
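	// For example, setDelimiter("|") stores both the String and its byte encoding under the current
	// charset; a later setCharset("UTF-16") call re-encodes that String so the two stay in sync.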
	private byte[] delimiter = new byte[] {'\n'};

	private String delimiterString = null;

	private int lineLengthLimit = Integer.MAX_VALUE;

	private int bufferSize = -1;

	private int numLineSamples = NUM_SAMPLES_UNDEFINED;

	// --------------------------------------------------------------------------------------------
	//  Constructors & Getters/setters for the configurable parameters
	// --------------------------------------------------------------------------------------------

	public DelimitedInputFormat() {
		this(null, null);
	}

	protected DelimitedInputFormat(Path filePath, Configuration configuration) {
		super(filePath);
		if (configuration == null) {
			configuration = GlobalConfiguration.loadConfiguration();
		}
		loadConfigParameters(configuration);
	}

	/**
	 * Get the character set used for the row delimiter. This is also used by
	 * subclasses to interpret field delimiters, comment strings, and for
	 * configuring {@link FieldParser}s.
	 *
	 * @return the charset
	 */
	@PublicEvolving
	public Charset getCharset() {
		if (this.charset == null) {
			this.charset = Charset.forName(charsetName);
		}
		return this.charset;
	}

	/**
	 * Set the name of the character set used for the row delimiter. This is
	 * also used by subclasses to interpret field delimiters, comment strings,
	 * and for configuring {@link FieldParser}s.
	 *
	 * <p>These fields are interpreted when set. Changing the charset thereafter
	 * may cause unexpected results.</p>
	 *
	 * @param charset name of the charset
	 */
	@PublicEvolving
	public void setCharset(String charset) {
		this.charsetName = Preconditions.checkNotNull(charset);
		this.charset = null;

		if (this.delimiterString != null) {
			this.delimiter = delimiterString.getBytes(getCharset());
		}
	}

	public byte[] getDelimiter() {
		return delimiter;
	}

	public void setDelimiter(byte[] delimiter) {
		if (delimiter == null) {
			throw new IllegalArgumentException("Delimiter must not be null");
		}
		this.delimiter = delimiter;
		this.delimiterString = null;
	}

	public void setDelimiter(char delimiter) {
		setDelimiter(String.valueOf(delimiter));
	}

	public void setDelimiter(String delimiter) {
		if (delimiter == null) {
			throw new IllegalArgumentException("Delimiter must not be null");
		}
		this.delimiter = delimiter.getBytes(getCharset());
		this.delimiterString = delimiter;
	}

	public int getLineLengthLimit() {
		return lineLengthLimit;
	}

	public void setLineLengthLimit(int lineLengthLimit) {
		if (lineLengthLimit < 1) {
			throw new IllegalArgumentException("Line length limit must be at least 1.");
		}
		this.lineLengthLimit = lineLengthLimit;
	}

	public int getBufferSize() {
		return bufferSize;
	}

	public void setBufferSize(int bufferSize) {
		if (bufferSize < 1) {
			throw new IllegalArgumentException("Buffer size must be at least 1.");
		}
		this.bufferSize = bufferSize;
	}

	public int getNumLineSamples() {
		return numLineSamples;
	}

	public void setNumLineSamples(int numLineSamples) {
		if (numLineSamples < 0) {
			throw new IllegalArgumentException("Number of line samples must not be negative.");
		}
		this.numLineSamples = numLineSamples;
	}

	// --------------------------------------------------------------------------------------------
	//  User-defined behavior
	// --------------------------------------------------------------------------------------------

	/**
	 * This function parses the given byte array which represents a serialized record.
	 * The function returns a valid record or throws an IOException.
	 *
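	 * <p>For illustration only, a minimal subclass (the {@code LineFormat} name is hypothetical)
	 * could decode each delimited record into a {@code String} using the configured charset:</p>
	 * <pre>{@code
	 * public class LineFormat extends DelimitedInputFormat<String> {
	 *     public String readRecord(String reuse, byte[] bytes, int offset, int numBytes) {
	 *         return new String(bytes, offset, numBytes, getCharset());
	 *     }
	 * }
	 * }</pre>
	 *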
	 * @param reuse An optionally reusable object.
	 * @param bytes Binary data of serialized records.
	 * @param offset The offset where to start to read the record data.
	 * @param numBytes The number of bytes that can be read starting at the offset position.
	 *
	 * @return Returns the read record if it was successfully deserialized.
	 * @throws IOException if the record could not be read.
	 */
	public abstract OT readRecord(OT reuse, byte[] bytes, int offset, int numBytes) throws IOException;

	// --------------------------------------------------------------------------------------------
	//  Pre-flight: Configuration, Splits, Sampling
	// --------------------------------------------------------------------------------------------

	/**
	 * Configures this input format by reading the path to the file from the configuration and the string that
	 * defines the record delimiter.
	 *
	 * @param parameters The configuration object to read the parameters from.
	 */
	@Override
	public void configure(Configuration parameters) {
		super.configure(parameters);

		// the if() clauses are to prevent the configure() method from
		// overwriting the values set by the setters

		if (Arrays.equals(delimiter, new byte[] {'\n'})) {
			String delimString = parameters.getString(RECORD_DELIMITER, null);
			if (delimString != null) {
				setDelimiter(delimString);
			}
		}

		// set the number of samples
		if (numLineSamples == NUM_SAMPLES_UNDEFINED) {
			String samplesString = parameters.getString(NUM_STATISTICS_SAMPLES, null);
			if (samplesString != null) {
				try {
					setNumLineSamples(Integer.parseInt(samplesString));
				} catch (NumberFormatException e) {
					if (LOG.isWarnEnabled()) {
						LOG.warn("Invalid value for number of samples to take: " + samplesString + ". Skipping sampling.");
					}
					setNumLineSamples(0);
				}
			}
		}
	}

	@Override
	public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {

		final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
				(FileBaseStatistics) cachedStats : null;

		// store properties
		final long oldTimeout = this.openTimeout;
		final int oldBufferSize = this.bufferSize;
		final int oldLineLengthLimit = this.lineLengthLimit;
		try {
			final Path filePath = this.filePath;

			// get the filesystem
			final FileSystem fs = FileSystem.get(filePath.toUri());
			final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);

			// let the file input format deal with the up-to-date check and the basic size
			final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
			if (stats == null) {
				return null;
			}

			// check whether the width per record is already known or the total size is unknown as well
			// in both cases, we return the stats as they are
			if (stats.getAverageRecordWidth() != FileBaseStatistics.AVG_RECORD_BYTES_UNKNOWN ||
					stats.getTotalInputSize() == FileBaseStatistics.SIZE_UNKNOWN) {
				return stats;
			}

			// disable sampling for unsplittable files since the logic below assumes splittability.
			// TODO: Add sampling for unsplittable files. Right now, only compressed text files are affected by this limitation.
			if (unsplittable) {
				return stats;
			}

			// compute how many samples to take, depending on the defined upper and lower bound
			final int numSamples;
			if (this.numLineSamples != NUM_SAMPLES_UNDEFINED) {
				numSamples = this.numLineSamples;
			} else {
				// make the samples small for very small files
				final int calcSamples = (int) (stats.getTotalInputSize() / 1024);
				numSamples = Math.min(DEFAULT_MAX_NUM_SAMPLES, Math.max(DEFAULT_MIN_NUM_SAMPLES, calcSamples));
			}
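			// Example: a 10 MiB input yields calcSamples = (10 * 1024 * 1024) / 1024 = 10240,
			// which Math.min/Math.max above clamp into [DEFAULT_MIN_NUM_SAMPLES, DEFAULT_MAX_NUM_SAMPLES].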

			// check if sampling is disabled.
			if (numSamples == 0) {
				return stats;
			}
			if (numSamples < 0) {
				throw new RuntimeException("Error: Invalid number of samples: " + numSamples);
			}

			// make sure that the sampling times out after a while if the file system does not answer in time
			this.openTimeout = 10000;
			// set a small read buffer size
			this.bufferSize = 4 * 1024;
			// prevent overly large records, for example if we have an incorrectly configured delimiter
			this.lineLengthLimit = MAX_SAMPLE_LEN;

			long offset = 0;
			long totalNumBytes = 0;
			long stepSize = stats.getTotalInputSize() / numSamples;

			int fileNum = 0;
			int samplesTaken = 0;

			// take the samples
			while (samplesTaken < numSamples && fileNum < allFiles.size()) {
				// make a split for the sample and use it to read a record
				FileStatus file = allFiles.get(fileNum);
				FileInputSplit split = new FileInputSplit(0, file.getPath(), offset, file.getLen() - offset, null);

				// we open the split, read one line, and take its length
				try {
					open(split);
					if (readLine()) {
						totalNumBytes += this.currLen + this.delimiter.length;
						samplesTaken++;
					}
				} finally {
					// close the file stream, do not release the buffers
					super.close();
				}

				offset += stepSize;

				// skip to the next file, if necessary
				while (fileNum < allFiles.size() && offset >= (file = allFiles.get(fileNum)).getLen()) {
					offset -= file.getLen();
					fileNum++;
				}
			}

			// we have the width, store it
			return new FileBaseStatistics(stats.getLastModificationTime(),
					stats.getTotalInputSize(), totalNumBytes / (float) samplesTaken);

		} catch (IOException ioex) {
			if (LOG.isWarnEnabled()) {
				LOG.warn("Could not determine statistics for file '" + this.filePath + "' due to an I/O error: " +
						ioex.getMessage());
			}
		} catch (Throwable t) {
			if (LOG.isErrorEnabled()) {
				LOG.error("Unexpected problem while getting the file statistics for file '" + this.filePath + "': " +
						t.getMessage(), t);
			}
		} finally {
			// restore properties (even on return)
			this.openTimeout = oldTimeout;
			this.bufferSize = oldBufferSize;
			this.lineLengthLimit = oldLineLengthLimit;
		}

		// no statistics possible
		return null;
	}

	/**
	 * Opens the given input split. This method opens the input stream to the specified file, allocates read buffers
	 * and positions the stream at the correct position, making sure that any partial record at the beginning is skipped.
	 *
	 * @param split The input split to open.
	 *
	 * @see org.apache.flink.api.common.io.FileInputFormat#open(org.apache.flink.core.fs.FileInputSplit)
	 */
	@Override
	public void open(FileInputSplit split) throws IOException {
		super.open(split);
		initBuffers();

		this.offset = splitStart;
		if (this.splitStart != 0) {
			this.stream.seek(offset);
			readLine();
			// if the first partial record already pushes the stream over
			// the limit of our split, then no record starts within this split
			if (this.overLimit) {
				this.end = true;
			}
		} else {
			fillBuffer();
		}
	}

	private void initBuffers() {
		this.bufferSize = this.bufferSize <= 0 ? DEFAULT_READ_BUFFER_SIZE : this.bufferSize;

		if (this.readBuffer == null || this.readBuffer.length != this.bufferSize) {
			this.readBuffer = new byte[this.bufferSize];
		}
		if (this.wrapBuffer == null || this.wrapBuffer.length < 256) {
			this.wrapBuffer = new byte[256];
		}

		this.readPos = 0;
		this.limit = 0;
		this.overLimit = false;
		this.end = false;
	}

	/**
	 * Checks whether the current split is at its end.
	 *
	 * @return True, if the split is at its end, false otherwise.
	 */
	@Override
	public boolean reachedEnd() {
		return this.end;
	}

	@Override
	public OT nextRecord(OT record) throws IOException {
		if (readLine()) {
			return readRecord(record, this.currBuffer, this.currOffset, this.currLen);
		} else {
			this.end = true;
			return null;
		}
	}

	/**
	 * Closes the input by releasing all buffers and closing the file input stream.
	 *
	 * @throws IOException Thrown, if the closing of the file stream causes an I/O error.
	 */
	@Override
	public void close() throws IOException {
		this.wrapBuffer = null;
		this.readBuffer = null;
		super.close();
	}

	// --------------------------------------------------------------------------------------------

	protected final boolean readLine() throws IOException {
		if (this.stream == null || this.overLimit) {
			return false;
		}

		int countInWrapBuffer = 0;

		/* index of the next delimiter byte to match, i.e. the number of delimiter bytes matched so far */
		int i = 0;

		while (true) {
			if (this.readPos >= this.limit) {
				if (!fillBuffer()) {
					if (countInWrapBuffer > 0) {
						this.offset += countInWrapBuffer;
						setResult(this.wrapBuffer, 0, countInWrapBuffer);
						return true;
					} else {
						return false;
					}
				}
			}

			int startPos = this.readPos;
			int count;

			// Search for next occurrence of delimiter in read buffer.
			while (this.readPos < this.limit && i < this.delimiter.length) {
				if ((this.readBuffer[this.readPos]) == this.delimiter[i]) {
					// Found the expected delimiter character. Continue looking for the next character of delimiter.
					i++;
				} else {
					// Delimiter does not match.
					// We have to reset the read position to the character after the first matching character
					// and search for the whole delimiter again.
					readPos -= i;
					i = 0;
				}
				readPos++;
			}

			// check why we dropped out
			if (i == this.delimiter.length) {
				// delimiter found
				int totalBytesRead = this.readPos - startPos;
				this.offset += countInWrapBuffer + totalBytesRead;
				count = totalBytesRead - this.delimiter.length;

				// copy to byte array
				if (countInWrapBuffer > 0) {
					// check wrap buffer size
					if (this.wrapBuffer.length < countInWrapBuffer + count) {
						final byte[] nb = new byte[countInWrapBuffer + count];
						System.arraycopy(this.wrapBuffer, 0, nb, 0, countInWrapBuffer);
						this.wrapBuffer = nb;
					}
					if (count >= 0) {
						System.arraycopy(this.readBuffer, 0, this.wrapBuffer, countInWrapBuffer, count);
					}
					setResult(this.wrapBuffer, 0, countInWrapBuffer + count);
					return true;
				} else {
					setResult(this.readBuffer, startPos, count);
					return true;
				}
			} else {
				count = this.limit - startPos;

				// check against the maximum record length
				if (((long) countInWrapBuffer) + count > this.lineLengthLimit) {
					throw new IOException("The record length exceeded the maximum record length (" +
							this.lineLengthLimit + ").");
				}

				// buffer exhausted
				if (this.wrapBuffer.length - countInWrapBuffer < count) {
					// reallocate
					byte[] tmp = new byte[Math.max(this.wrapBuffer.length * 2, countInWrapBuffer + count)];
					System.arraycopy(this.wrapBuffer, 0, tmp, 0, countInWrapBuffer);
					this.wrapBuffer = tmp;
				}

				System.arraycopy(this.readBuffer, startPos, this.wrapBuffer, countInWrapBuffer, count);
				countInWrapBuffer += count;
			}
		}
	}

	private void setResult(byte[] buffer, int offset, int len) {
		this.currBuffer = buffer;
		this.currOffset = offset;
		this.currLen = len;
	}

	private boolean fillBuffer() throws IOException {
		// special case for reading the whole split.
		if (this.splitLength == FileInputFormat.READ_WHOLE_SPLIT_FLAG) {
			int read = this.stream.read(this.readBuffer, 0, readBuffer.length);
			if (read == -1) {
				this.stream.close();
				this.stream = null;
				return false;
			} else {
				this.readPos = 0;
				this.limit = read;
				return true;
			}
		}

		// else: regular reading within the split (possibly one read past its end to finish the last record)
		int toRead;
		if (this.splitLength > 0) {
			// if we have more data, read that
			toRead = this.splitLength > this.readBuffer.length ? this.readBuffer.length : (int) this.splitLength;
		} else {
			// if we have exhausted our split, we need to complete the current record, or read one
			// more across the next split.
			// the reason is that the next split will skip over the beginning until it finds the first
			// delimiter, discarding it as an incomplete chunk of data that belongs to the last record in the
			// previous split.
			toRead = this.readBuffer.length;
			this.overLimit = true;
		}

		int read = this.stream.read(this.readBuffer, 0, toRead);

		if (read == -1) {
			this.stream.close();
			this.stream = null;
			return false;
		} else {
			this.splitLength -= read;
			this.readPos = 0;
			this.limit = read;
			return true;
		}
	}

	// --------------------------------------------------------------------------------------------
	//  Config Keys for Parametrization via configuration
	// --------------------------------------------------------------------------------------------

	/**
	 * The configuration key to set the record delimiter.
	 */
	protected static final String RECORD_DELIMITER = "delimited-format.delimiter";

	/**
	 * The configuration key to set the number of samples to take for the statistics.
	 */
	private static final String NUM_STATISTICS_SAMPLES = "delimited-format.numSamples";

	// --------------------------------------------------------------------------------------------
	//  Checkpointing
	// --------------------------------------------------------------------------------------------

	@PublicEvolving
	@Override
	public Long getCurrentState() throws IOException {
		return this.offset;
	}

	@PublicEvolving
	@Override
	public void reopen(FileInputSplit split, Long state) throws IOException {
		Preconditions.checkNotNull(split, "reopen() cannot be called on a null split.");
		Preconditions.checkNotNull(state, "reopen() cannot be called with a null initial state.");
		Preconditions.checkArgument(state == -1 || state >= split.getStart(),
				"Illegal offset " + state + ", smaller than the split's start=" + split.getStart());

		try {
			this.open(split);
		} finally {
			this.offset = state;
		}

		if (state > this.splitStart + split.getLength()) {
			this.end = true;
		} else if (state > split.getStart()) {
			initBuffers();

			this.stream.seek(this.offset);
			if (split.getLength() == -1) {
				// this is the case for unsplittable files
				fillBuffer();
			} else {
				this.splitLength = this.splitStart + split.getLength() - this.offset;
				if (splitLength <= 0) {
					this.end = true;
				}
			}
		}
	}
}