/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package gobblin.hive;

import java.io.IOException;
import java.util.Locale;

import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat;
import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.serde2.SerDe;
import org.apache.hadoop.hive.serde2.avro.AvroSerDe;
import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe;
import org.apache.hadoop.mapred.TextInputFormat;

import com.google.common.base.Enums;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;

import gobblin.annotation.Alpha;
import gobblin.configuration.State;


/**
 * A wrapper around {@link SerDe} that bundles the input format and output format class names
 * with a {@link SerDe} class name, and instantiates the {@link SerDe} lazily on first use.
 *
 * <p>Instances are obtained through the static factories {@link #get(String)},
 * {@link #get(String, Optional, Optional)}, {@link #getSerializer(State)} and
 * {@link #getDeserializer(State)}.
 *
 * @author Ziyang Liu
 */
@Alpha
@SuppressWarnings("deprecation")
public class HiveSerDeWrapper {

  private static final String SERDE_SERIALIZER_PREFIX = "serde.serializer.";
  private static final String SERDE_DESERIALIZER_PREFIX = "serde.deserializer.";

  public static final String SERDE_SERIALIZER_TYPE = SERDE_SERIALIZER_PREFIX + "type";
  public static final String SERDE_SERIALIZER_INPUT_FORMAT_TYPE = SERDE_SERIALIZER_PREFIX + "input.format.type";
  public static final String SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE = SERDE_SERIALIZER_PREFIX + "output.format.type";

  public static final String SERDE_DESERIALIZER_TYPE = SERDE_DESERIALIZER_PREFIX + "type";
  public static final String SERDE_DESERIALIZER_INPUT_FORMAT_TYPE = SERDE_DESERIALIZER_PREFIX + "input.format.type";
  public static final String SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE = SERDE_DESERIALIZER_PREFIX + "output.format.type";

  /**
   * The SerDe types that can be referred to by name. Each constant carries the class names of
   * its {@link SerDe}, input format and output format.
   */
  public enum BuiltInHiveSerDe {

    AVRO(AvroSerDe.class.getName(), AvroContainerInputFormat.class.getName(),
        AvroContainerOutputFormat.class.getName()),
    ORC(OrcSerde.class.getName(), OrcInputFormat.class.getName(), OrcOutputFormat.class.getName()),
    PARQUET(ParquetHiveSerDe.class.getName(), MapredParquetInputFormat.class.getName(),
        MapredParquetOutputFormat.class.getName()),
    TEXTFILE(LazySimpleSerDe.class.getName(), TextInputFormat.class.getName(),
        HiveIgnoreKeyTextOutputFormat.class.getName());

    private final String serDeClassName;
    private final String inputFormatClassName;
    private final String outputFormatClassName;

    private BuiltInHiveSerDe(String serDeClassName, String inputFormatClassName, String outputFormatClassName) {
      this.serDeClassName = serDeClassName;
      this.inputFormatClassName = inputFormatClassName;
      this.outputFormatClassName = outputFormatClassName;
    }

    /**
     * Returns the fully qualified class name of the {@link SerDe}, not the constant's name.
     */
    @Override
    public String toString() {
      return this.serDeClassName;
    }
  }

  // Populated on the first call to getSerDe().
  private Optional<SerDe> serDe = Optional.absent();
  private final String serDeClassName;
  private final String inputFormatClassName;
  private final String outputFormatClassName;

  private HiveSerDeWrapper(BuiltInHiveSerDe hiveSerDe) {
    this(hiveSerDe.serDeClassName, hiveSerDe.inputFormatClassName, hiveSerDe.outputFormatClassName);
  }

  private HiveSerDeWrapper(String serDeClassName, String inputFormatClassName, String outputFormatClassName) {
    this.serDeClassName = serDeClassName;
    this.inputFormatClassName = inputFormatClassName;
    this.outputFormatClassName = outputFormatClassName;
  }

  /**
   * Get the {@link SerDe} instance associated with this {@link HiveSerDeWrapper}.
   *
   * <p>This method performs lazy initialization: the {@link SerDe} class is loaded and instantiated
   * through its no-argument constructor on the first call, and the same instance is returned afterwards.
   * The initialization is not synchronized, so concurrent first calls may each create an instance.
   *
   * @return the {@link SerDe} instance
   * @throws IOException if the class cannot be loaded or instantiated, or is not a {@link SerDe}
   */
  public SerDe getSerDe() throws IOException {
    if (!this.serDe.isPresent()) {
      try {
        this.serDe = Optional.of(SerDe.class.cast(Class.forName(this.serDeClassName).newInstance()));
      } catch (Throwable t) {
        // Deliberately broad: class loading can fail with Errors (e.g. linkage or static-initializer
        // failures) as well as Exceptions, and callers expect all of them reported as IOException.
        throw new IOException("Failed to instantiate SerDe " + this.serDeClassName, t);
      }
    }
    return this.serDe.get();
  }

  /**
   * Get the input format class name associated with this {@link HiveSerDeWrapper}.
   */
  public String getInputFormatClassName() {
    return this.inputFormatClassName;
  }

  /**
   * Get the output format class name associated with this {@link HiveSerDeWrapper}.
   */
  public String getOutputFormatClassName() {
    return this.outputFormatClassName;
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper}.
   *
   * @param serDeType The SerDe type. This should be one of the available {@link HiveSerDeWrapper.BuiltInHiveSerDe}s
   *        (matched case-insensitively).
   * @return a new {@link HiveSerDeWrapper} for the given built-in type
   * @throws NullPointerException if serDeType is null
   * @throws IllegalArgumentException if serDeType is not one of the {@link HiveSerDeWrapper.BuiltInHiveSerDe}s
   */
  public static HiveSerDeWrapper get(String serDeType) {
    return get(serDeType, Optional.<String> absent(), Optional.<String> absent());
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper}.
   *
   * @param serDeType The SerDe type. If serDeType is the name (matched case-insensitively) of one of the
   *        available {@link HiveSerDeWrapper.BuiltInHiveSerDe}s, the other two parameters are not used.
   *        Otherwise, serDeType should be the class name of a {@link SerDe}, and the other two parameters
   *        must be present.
   * @param inputFormatClassName The input format class name; required when serDeType is not built in.
   * @param outputFormatClassName The output format class name; required when serDeType is not built in.
   * @return a new {@link HiveSerDeWrapper}
   * @throws NullPointerException if serDeType is null
   * @throws IllegalArgumentException if serDeType is not built in and either format class name is absent
   */
  public static HiveSerDeWrapper get(String serDeType, Optional<String> inputFormatClassName,
      Optional<String> outputFormatClassName) {
    Preconditions.checkNotNull(serDeType, "SerDe type must not be null");

    // Locale.ROOT keeps the enum lookup independent of the JVM default locale; with a Turkish
    // locale, for example, "textfile".toUpperCase() would not yield "TEXTFILE".
    Optional<BuiltInHiveSerDe> hiveSerDe =
        Enums.getIfPresent(BuiltInHiveSerDe.class, serDeType.toUpperCase(Locale.ROOT));
    if (hiveSerDe.isPresent()) {
      return new HiveSerDeWrapper(hiveSerDe.get());
    }

    Preconditions.checkArgument(inputFormatClassName.isPresent(),
        "Missing input format class name for SerDe " + serDeType);
    Preconditions.checkArgument(outputFormatClassName.isPresent(),
        "Missing output format class name for SerDe " + serDeType);
    return new HiveSerDeWrapper(serDeType, inputFormatClassName.get(), outputFormatClassName.get());
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper} from a {@link State}.
   *
   * @param state The state should contain property {@link #SERDE_SERIALIZER_TYPE}, and optionally contain
   *        properties {@link #SERDE_SERIALIZER_INPUT_FORMAT_TYPE} and {@link #SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE}.
   * @return a new {@link HiveSerDeWrapper} built from the serializer properties
   * @throws IllegalArgumentException if {@link #SERDE_SERIALIZER_TYPE} is missing, or if the type is not
   *         built in and either format property is missing
   */
  public static HiveSerDeWrapper getSerializer(State state) {
    Preconditions.checkArgument(state.contains(SERDE_SERIALIZER_TYPE),
        "Missing required property " + SERDE_SERIALIZER_TYPE);
    return get(state.getProp(SERDE_SERIALIZER_TYPE),
        Optional.fromNullable(state.getProp(SERDE_SERIALIZER_INPUT_FORMAT_TYPE)),
        Optional.fromNullable(state.getProp(SERDE_SERIALIZER_OUTPUT_FORMAT_TYPE)));
  }

  /**
   * Get an instance of {@link HiveSerDeWrapper} from a {@link State}.
   *
   * @param state The state should contain property {@link #SERDE_DESERIALIZER_TYPE}, and optionally contain
   *        properties {@link #SERDE_DESERIALIZER_INPUT_FORMAT_TYPE} and
   *        {@link #SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE}.
   * @return a new {@link HiveSerDeWrapper} built from the deserializer properties
   * @throws IllegalArgumentException if {@link #SERDE_DESERIALIZER_TYPE} is missing, or if the type is not
   *         built in and either format property is missing
   */
  public static HiveSerDeWrapper getDeserializer(State state) {
    Preconditions.checkArgument(state.contains(SERDE_DESERIALIZER_TYPE),
        "Missing required property " + SERDE_DESERIALIZER_TYPE);
    return get(state.getProp(SERDE_DESERIALIZER_TYPE),
        Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_INPUT_FORMAT_TYPE)),
        Optional.fromNullable(state.getProp(SERDE_DESERIALIZER_OUTPUT_FORMAT_TYPE)));
  }
}