/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.contrib.fileformat.base64;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import org.apache.commons.codec.binary.Base64;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobConfigurable;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
/**
* FileInputFormat for base64 encoded text files.
*
* Each line is a base64-encoded record. The key is a LongWritable which is the
* offset. The value is a BytesWritable containing the base64-decoded bytes.
*
* This class accepts a configurable parameter:
* "base64.text.input.format.signature"
*
* The UTF-8 encoded signature is compared with the leading bytes of each
* decoded record. If they don't match, the record is discarded. If they match,
* the signature is stripped off the data. When the parameter is not set, the
* signature is empty and every record is accepted unchanged.
*/
public class Base64TextInputFormat implements
InputFormat<LongWritable, BytesWritable>, JobConfigurable {
/**
* Base64LineRecordReader.
*
*/
public static class Base64LineRecordReader implements
RecordReader<LongWritable, BytesWritable>, JobConfigurable {
LineRecordReader reader;
Text text;
public Base64LineRecordReader(LineRecordReader reader) {
this.reader = reader;
text = reader.createValue();
}
@Override
public void close() throws IOException {
reader.close();
}
@Override
public LongWritable createKey() {
return reader.createKey();
}
@Override
public BytesWritable createValue() {
return new BytesWritable();
}
@Override
public long getPos() throws IOException {
return reader.getPos();
}
@Override
public float getProgress() throws IOException {
return reader.getProgress();
}
@Override
public boolean next(LongWritable key, BytesWritable value) throws IOException {
while (reader.next(key, text)) {
// text -> byte[] -> value
byte[] textBytes = text.getBytes();
int length = text.getLength();
// Trim additional bytes
if (length != textBytes.length) {
textBytes = Arrays.copyOf(textBytes, length);
}
byte[] binaryData = base64.decode(textBytes);
// compare data header with signature
int i;
for (i = 0; i < binaryData.length && i < signature.length
&& binaryData[i] == signature[i]; ++i) {
;
}
// return the row only if it's not corrupted
if (i == signature.length) {
value.set(binaryData, signature.length, binaryData.length
- signature.length);
return true;
}
}
// no more data
return false;
}
private byte[] signature;
private final Base64 base64 = createBase64();
@Override
public void configure(JobConf job) {
try {
String signatureString = job.get("base64.text.input.format.signature");
if (signatureString != null) {
signature = signatureString.getBytes("UTF-8");
} else {
signature = new byte[0];
}
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
}
TextInputFormat format;
JobConf job;
public Base64TextInputFormat() {
format = new TextInputFormat();
}
@Override
public void configure(JobConf job) {
this.job = job;
format.configure(job);
}
public RecordReader<LongWritable, BytesWritable> getRecordReader(
InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
reporter.setStatus(genericSplit.toString());
Base64LineRecordReader reader = new Base64LineRecordReader(
new LineRecordReader(job, (FileSplit) genericSplit));
reader.configure(job);
return reader;
}
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
return format.getSplits(job, numSplits);
}
/**
* Workaround an incompatible change from commons-codec 1.3 to 1.4.
* Since Hadoop has this jar on its classpath, we have no way of knowing
* which version we are running against.
*/
static Base64 createBase64() {
try {
// This constructor appeared in 1.4 and specifies that we do not want to
// line-wrap or use any newline separator
Constructor<Base64> ctor = Base64.class.getConstructor(int.class, byte[].class);
return ctor.newInstance(0, null);
} catch (NoSuchMethodException e) { // ie we are running 1.3
// In 1.3, this constructor has the same behavior, but in 1.4 the default
// was changed to add wrapping and newlines.
return new Base64();
} catch (InstantiationException e) {
throw new RuntimeException(e);
} catch (IllegalAccessException e) {
throw new RuntimeException(e);
} catch (InvocationTargetException e) {
throw new RuntimeException(e.getCause());
}
}
}