Base64TextInputFormat.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.contrib.fileformat.base64;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;

import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

/**
 * FileInputFormat for base64 encoded text files.
 * 
 * Each line is a base64-encoded record. The key is a LongWritable which is the
 * offset. The value is a BytesWritable containing the base64-decoded bytes.
 * 
 * This class accepts a configurable parameter:
 * "base64.text.input.format.signature"
 * 
 * The UTF-8 encoded signature will be compared with the beginning of each
 * decoded bytes. If they don't match, the record is discarded. If they match,
 * the signature is stripped off the data.
 */
public class Base64TextInputFormat implements
    InputFormat<LongWritable, BytesWritable>, JobConfigurable {

  /**
   * Base64LineRecordReader.
   *
   */
  public static class Base64LineRecordReader implements
      RecordReader<LongWritable, BytesWritable>, JobConfigurable {

    LineRecordReader reader;
    Text text;

    public Base64LineRecordReader(LineRecordReader reader) {
      this.reader = reader;
      text = reader.createValue();
    }

    @Override
    public void close() throws IOException {
      reader.close();
    }

    @Override
    public LongWritable createKey() {
      return reader.createKey();
    }

    @Override
    public BytesWritable createValue() {
      return new BytesWritable();
    }

    @Override
    public long getPos() throws IOException {
      return reader.getPos();
    }

    @Override
    public float getProgress() throws IOException {
      return reader.getProgress();
    }

    @Override
    public boolean next(LongWritable key, BytesWritable value) throws IOException {
      while (reader.next(key, text)) {
        // text -> byte[] -> value
        byte[] textBytes = text.getBytes();
        int length = text.getLength();

        // Trim additional bytes
        if (length != textBytes.length) {
          textBytes = Arrays.copyOf(textBytes, length);
        }
        byte[] binaryData = base64.decode(textBytes);

        // compare data header with signature
        int i;
        for (i = 0; i < binaryData.length && i < signature.length
            && binaryData[i] == signature[i]; ++i) {
          ;
        }

        // return the row only if it's not corrupted
        if (i == signature.length) {
          value.set(binaryData, signature.length, binaryData.length
              - signature.length);
          return true;
        }
      }
      // no more data
      return false;
    }

    private byte[] signature;
    private final Base64 base64 = createBase64();

    @Override
    public void configure(JobConf job) {
      try {
        String signatureString = job.get("base64.text.input.format.signature");
        if (signatureString != null) {
          signature = signatureString.getBytes("UTF-8");
        } else {
          signature = new byte[0];
        }
      } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
      }
    }

  }

  TextInputFormat format;
  JobConf job;

  public Base64TextInputFormat() {
    format = new TextInputFormat();
  }

  @Override
  public void configure(JobConf job) {
    this.job = job;
    format.configure(job);
  }

  public RecordReader<LongWritable, BytesWritable> getRecordReader(
      InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
    reporter.setStatus(genericSplit.toString());
    Base64LineRecordReader reader = new Base64LineRecordReader(
        new LineRecordReader(job, (FileSplit) genericSplit));
    reader.configure(job);
    return reader;
  }

  @Override
  public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    return format.getSplits(job, numSplits);
  }

  /**
   * Workaround an incompatible change from commons-codec 1.3 to 1.4.
   * Since Hadoop has this jar on its classpath, we have no way of knowing
   * which version we are running against.
   */
  static Base64 createBase64() {
    try {
      // This constructor appeared in 1.4 and specifies that we do not want to
      // line-wrap or use any newline separator
      Constructor<Base64> ctor = Base64.class.getConstructor(int.class, byte[].class);
      return ctor.newInstance(0, null);
    } catch (NoSuchMethodException e) { // ie we are running 1.3
      // In 1.3, this constructor has the same behavior, but in 1.4 the default
      // was changed to add wrapping and newlines.
      return new Base64();
    } catch (InstantiationException e) {
      throw new RuntimeException(e);
    } catch (IllegalAccessException e) {
      throw new RuntimeException(e);
    } catch (InvocationTargetException e) {
      throw new RuntimeException(e.getCause());
    }
  }

}