/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.api.common.io; import java.io.IOException; import java.io.Serializable; import org.apache.flink.annotation.Public; import org.apache.flink.api.common.io.statistics.BaseStatistics; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.io.InputSplit; import org.apache.flink.core.io.InputSplitAssigner; import org.apache.flink.core.io.InputSplitSource; /** * The base interface for data sources that produces records. * <p> * The input format handles the following: * <ul> * <li>It describes how the input is split into splits that can be processed in parallel.</li> * <li>It describes how to read records from the input split.</li> * <li>It describes how to gather basic statistics from the input.</li> * </ul> * <p> * The life cycle of an input format is the following: * <ol> * <li>After being instantiated (parameterless), it is configured with a {@link Configuration} object. * Basic fields are read from the configuration, such as for example a file path, if the format describes * files as input.</li> * <li>Optionally: It is called by the compiler to produce basic statistics about the input.</li> * <li>It is called to create the input splits.</li> * <li>Each parallel input task creates an instance, configures it and opens it for a specific split.</li> * <li>All records are read from the input</li> * <li>The input format is closed</li> * </ol> * <p> * IMPORTANT NOTE: Input formats must be written such that an instance can be opened again after it was closed. That * is due to the fact that the input format is used for potentially multiple splits. After a split is done, the * format's close function is invoked and, if another split is available, the open function is invoked afterwards for * the next split. * * @see InputSplit * @see BaseStatistics * * @param <OT> The type of the produced records. * @param <T> The type of input split. */ @Public public interface InputFormat<OT, T extends InputSplit> extends InputSplitSource<T>, Serializable { /** * Configures this input format. Since input formats are instantiated generically and hence parameterless, * this method is the place where the input formats set their basic fields based on configuration values. * <p> * This method is always called first on a newly instantiated input format. * * @param parameters The configuration with all parameters (note: not the Flink config but the TaskConfig). */ void configure(Configuration parameters); /** * Gets the basic statistics from the input described by this format. If the input format does not know how * to create those statistics, it may return null. * This method optionally gets a cached version of the statistics. The input format may examine them and decide * whether it directly returns them without spending effort to re-gather the statistics. * <p> * When this method is called, the input format it guaranteed to be configured. * * @param cachedStatistics The statistics that were cached. May be null. * @return The base statistics for the input, or null, if not available. */ BaseStatistics getStatistics(BaseStatistics cachedStatistics) throws IOException; /** * Creates the different splits of the input that can be processed in parallel. * <p> * When this method is called, the input format it guaranteed to be configured. * * @param minNumSplits The minimum desired number of splits. If fewer are created, some parallel * instances may remain idle. * @return The splits of this input that can be processed in parallel. * * @throws IOException Thrown, when the creation of the splits was erroneous. */ @Override T[] createInputSplits(int minNumSplits) throws IOException; /** * Gets the type of the input splits that are processed by this input format. * * @return The type of the input splits. */ @Override InputSplitAssigner getInputSplitAssigner(T[] inputSplits); // -------------------------------------------------------------------------------------------- /** * Opens a parallel instance of the input format to work on a split. * <p> * When this method is called, the input format it guaranteed to be configured. * * @param split The split to be opened. * @throws IOException Thrown, if the spit could not be opened due to an I/O problem. */ void open(T split) throws IOException; /** * Method used to check if the end of the input is reached. * <p> * When this method is called, the input format it guaranteed to be opened. * * @return True if the end is reached, otherwise false. * @throws IOException Thrown, if an I/O error occurred. */ boolean reachedEnd() throws IOException; /** * Reads the next record from the input. * <p> * When this method is called, the input format it guaranteed to be opened. * * @param reuse Object that may be reused. * @return Read record. * * @throws IOException Thrown, if an I/O error occurred. */ OT nextRecord(OT reuse) throws IOException; /** * Method that marks the end of the life-cycle of an input split. Should be used to close channels and streams * and release resources. After this method returns without an error, the input is assumed to be correctly read. * <p> * When this method is called, the input format it guaranteed to be opened. * * @throws IOException Thrown, if the input could not be closed properly. */ void close() throws IOException; }