/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.io;

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * An {@link org.apache.hadoop.mapred.InputFormat} for plain files with
 * {@link Deserializer} records.
 */
@Deprecated
public class FlatFileInputFormat<T> extends
    FileInputFormat<Void, FlatFileInputFormat.RowContainer<T>> {

  /**
   * A work-around until HADOOP-1230 is fixed.
   *
   * Allows boolean next(k, v) to be called by reference while still allowing
   * the deserializer to create a new object (i.e., row) on every call to
   * next.
   */
  public static class RowContainer<T> {
    T row;
  }

  /**
   * An implementation of SerializationContext is responsible for looking up
   * the Serialization implementation for the given RecordReader, potentially
   * based on the Configuration or some other mechanism.
   *
   * The SerializationFactory does not provide this functionality because:
   * 1. it requires Serialization implementations to be specified in the
   * Configuration a-priori (although that is the same as setting a
   * SerializationContext), and 2. it does not look up the actual subclass
   * being deserialized; e.g., for Serializable there is no way of
   * configuring the actual Java class being serialized/deserialized.
   */
  public static interface SerializationContext<S> extends Configurable {

    /**
     * Returns the {@link Serialization} object for objects of type S.
     *
     * @return a serialization object for this context
     */
    Serialization<S> getSerialization() throws IOException;

    /**
     * Returns the specific class to deserialize.
     */
    Class<? extends S> getRealClass() throws IOException;
  }

  /**
   * The JobConf key for the Serialization implementation.
   */
  public static final String SerializationImplKey =
      "mapred.input.serialization.implKey";

  /**
   * An implementation of {@link SerializationContext} that reads the
   * Serialization class and the specific subclass to be deserialized from
   * the JobConf.
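   * <p>
   * For example, the two keys might be set like this before the job is
   * submitted (JavaSerialization and MyRow are illustrative placeholders
   * here - any Serialization implementation and row class can be used, and
   * neither is mandated by this format):
   *
   * <pre>
   * JobConf job = new JobConf();
   * // which Serialization implementation to instantiate
   * job.setClass(FlatFileInputFormat.SerializationImplKey,
   *     org.apache.hadoop.io.serializer.JavaSerialization.class,
   *     org.apache.hadoop.io.serializer.Serialization.class);
   * // the concrete row class handed to the deserializer
   * job.setClass(
   *     FlatFileInputFormat.SerializationContextFromConf.SerializationSubclassKey,
   *     MyRow.class, Object.class);
   * </pre>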
   */
  public static class SerializationContextFromConf<S> implements
      FlatFileInputFormat.SerializationContext<S> {

    /**
     * The JobConf key for the class that is being deserialized.
     */
    public static final String SerializationSubclassKey =
        "mapred.input.serialization.subclassKey";

    /**
     * Implements Configurable so it can use the configuration to find the
     * right classes. Note: ReflectionUtils will automatically call setConf
     * with the right configuration.
     */
    private Configuration conf;

    @Override
    public void setConf(Configuration conf) {
      this.conf = conf;
    }

    @Override
    public Configuration getConf() {
      return conf;
    }

    /**
     * @return the actual class being deserialized
     * @exception IOException
     *              not currently thrown
     */
    @Override
    public Class<S> getRealClass() throws IOException {
      return (Class<S>) conf.getClass(SerializationSubclassKey, null,
          Object.class);
    }

    /**
     * Looks up and instantiates the Serialization object.
     *
     * It is important to note that we are not relying on the Hadoop
     * SerializationFactory part of the Serialization framework. In the case
     * of non-Writable objects, we cannot make any assumptions about the
     * uniformity of the serialization class APIs - i.e., there may not be a
     * "write" method call and a subclass may need to implement its own
     * Serialization classes. The SerializationFactory currently returns the
     * first (de)serializer that is compatible with the class to be
     * deserialized; in this context, that assumption isn't necessarily true.
     *
     * @return the serialization object for this context
     * @exception IOException
     *              not currently thrown
     */
    @Override
    public Serialization<S> getSerialization() throws IOException {
      Class<Serialization<S>> tClass = (Class<Serialization<S>>) conf
          .getClass(SerializationImplKey, null, Serialization.class);
      return tClass == null ? null : (Serialization<S>) ReflectionUtils
          .newInstance(tClass, conf);
    }
  }

  /**
   * A {@link RecordReader} for plain files with {@link Deserializer}
   * records.
   *
   * Reads one row at a time of type R. R is intended to be a base class of
   * something such as: Record, Writable, Text, ...
   */
  @Deprecated
  public class FlatFileRecordReader<R> implements
      RecordReader<Void, FlatFileInputFormat.RowContainer<R>> {

    /**
     * The stream in use - fsin if the file is not compressed, a
     * DataInputStream wrapping dcin otherwise.
     */
    private final DataInputStream in;

    /**
     * The decompressed stream, or null if the input is not compressed.
     */
    private final InputStream dcin;

    /**
     * The underlying stream.
     */
    private final FSDataInputStream fsin;

    /**
     * For calculating progress.
     */
    private final long end;

    /**
     * The constructed deserializer.
     */
    private final Deserializer<R> deserializer;

    /**
     * Once EOF is reached, stop calling the deserializer.
     */
    private boolean isEOF;

    /**
     * The JobConf which contains the information needed to instantiate the
     * correct Deserializer.
     */
    private final Configuration conf;

    /**
     * The actual class of the rows we are deserializing, not just the base
     * class.
     */
    private final Class<R> realRowClass;

    /**
     * Constructs the underlying stream (potentially decompressed) and
     * creates the deserializer.
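     * <p>
     * As a rough sketch of the stream wiring done below (names mirror this
     * reader's fields; the codec is chosen from the file's extension):
     *
     * <pre>
     * fsin = fileSys.open(path);   // raw file stream
     * in = (codec != null)         // decompress when a codec matches
     *     ? new DataInputStream(codec.createInputStream(fsin))
     *     : fsin;
     * deserializer.open(in);       // rows are then read from 'in'
     * </pre>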
     *
     * @param conf
     *          the JobConf
     * @param split
     *          the split for this file
     */
    public FlatFileRecordReader(Configuration conf, FileSplit split)
        throws IOException {
      final Path path = split.getPath();
      FileSystem fileSys = path.getFileSystem(conf);
      CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(
          conf);
      final CompressionCodec codec = compressionCodecs.getCodec(path);
      this.conf = conf;

      fsin = fileSys.open(path);
      if (codec != null) {
        dcin = codec.createInputStream(fsin);
        in = new DataInputStream(dcin);
      } else {
        dcin = null;
        in = fsin;
      }

      isEOF = false;
      end = split.getLength();

      // Instantiate a SerializationContext which this reader uses to look up
      // the Serialization class and the actual class being deserialized.
      SerializationContext<R> sinfo;
      Class<SerializationContext<R>> sinfoClass =
          (Class<SerializationContext<R>>) conf.getClass(
              SerializationContextImplKey, SerializationContextFromConf.class);
      sinfo = ReflectionUtils.newInstance(sinfoClass, conf);

      // Get the Serialization object and the class being deserialized.
      Serialization<R> serialization = sinfo.getSerialization();
      realRowClass = (Class<R>) sinfo.getRealClass();

      deserializer = serialization.getDeserializer(realRowClass);
      deserializer.open(in);
    }

    /**
     * The JobConf key of the SerializationContext to use.
     */
    public static final String SerializationContextImplKey =
        "mapred.input.serialization.context_impl";

    /**
     * @return null, as these files carry values only
     */
    @Override
    public Void createKey() {
      return null;
    }

    /**
     * @return a new RowContainer holding a new R instance
     */
    @Override
    public RowContainer<R> createValue() {
      RowContainer<R> r = new RowContainer<R>();
      r.row = ReflectionUtils.newInstance(realRowClass, conf);
      return r;
    }

    /**
     * Reads the next row into the given container.
     *
     * @param key
     *          unused, as these files have values only
     * @param value
     *          the row container, which is always re-used, although its
     *          internal value may be set to a new object
     * @return whether a row was read - true if one was, false at EOF
     * @exception IOException
     *              from the deserializer
     */
    @Override
    public synchronized boolean next(Void key, RowContainer<R> value)
        throws IOException {
      if (isEOF || in.available() == 0) {
        isEOF = true;
        return false;
      }

      // The deserializer is responsible for actually reading each record
      // from the stream.
      try {
        value.row = deserializer.deserialize(value.row);
        if (value.row == null) {
          isEOF = true;
          return false;
        }
        return true;
      } catch (EOFException e) {
        isEOF = true;
        return false;
      }
    }

    @Override
    public synchronized float getProgress() throws IOException {
      // This assumes no splitting.
      if (end == 0) {
        return 0.0f;
      } else {
        // Gives progress over the uncompressed stream and assumes the
        // deserializer is not doing its own buffering.
        return Math.min(1.0f, fsin.getPos() / (float) end);
      }
    }

    @Override
    public synchronized long getPos() throws IOException {
      // Assumes the deserializer is not doing its own buffering. This is the
      // position over the uncompressed stream; not sure what effect this has
      // on stats about the job.
      return fsin.getPos();
    }

    @Override
    public synchronized void close() throws IOException {
      // Assuming that this closes the underlying streams.
      deserializer.close();
    }
  }

  /**
   * These files are never split; a deserializer stream must be read from the
   * beginning.
   */
  @Override
  protected boolean isSplitable(FileSystem fs, Path filename) {
    return false;
  }

  @Override
  public RecordReader<Void, RowContainer<T>> getRecordReader(InputSplit split,
      JobConf job, Reporter reporter) throws IOException {
    reporter.setStatus(split.toString());
    return new FlatFileRecordReader<T>(job, (FileSplit) split);
  }
}
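
/*
 * A minimal read-loop sketch (illustrative only; assumes the serialization
 * keys described above have already been set on the JobConf and that MyRow
 * is the configured row class - MyRow is a placeholder, not a class shipped
 * with this file):
 *
 *   FlatFileInputFormat<MyRow> format = new FlatFileInputFormat<MyRow>();
 *   InputSplit[] splits = format.getSplits(job, 1);
 *   RecordReader<Void, FlatFileInputFormat.RowContainer<MyRow>> reader =
 *       format.getRecordReader(splits[0], job, Reporter.NULL);
 *   FlatFileInputFormat.RowContainer<MyRow> value = reader.createValue();
 *   while (reader.next(null, value)) {
 *     // value.row holds the deserialized record
 *   }
 *   reader.close();
 */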