/*
* Licensed to Think Big Analytics, Inc. under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. Think Big Analytics, Inc. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2010 Think Big Analytics. All Rights Reserved.
*/
package tap.core;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.*;
import org.apache.avro.io.*;
import org.apache.avro.mapred.*;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.ReflectionUtils;
import org.codehaus.jackson.JsonParseException;
import tap.Phase;
import tap.Pipe;
import tap.TapMapper;
import tap.core.io.BinaryKey;
import tap.core.mapreduce.input.TapfileInputFormat;
import tap.core.mapreduce.io.ProtobufWritable;
import tap.formats.FileFormat;
import tap.formats.Formats;
import tap.formats.avro.JsonToGenericRecord;
import tap.util.ObjectFactory;
import tap.util.ReflectUtils;
/* NB: KO and VO are set in Phase. Since those types determine the types used by
 * the OutputCollector, we cannot have map-only tasks write to tap files unless
 * we change this, e.g.:
 *   conf.setMapOutputKeyClass(AvroKey.class);
 *   conf.setMapOutputValueClass(AvroValue.class);
 */
@SuppressWarnings("deprecation")
public class MapperBridge<KEY, VALUE, IN, OUT, KO, VO> extends MapReduceBase
        implements org.apache.hadoop.mapred.Mapper<KEY, VALUE, KO, VO> {

    /** Number of leading bytes sniffed from the input file to detect its format. */
    private static final int SNIFF_HEADER_SIZE = 1000;

    /** The user-supplied mapper this bridge adapts to the Hadoop Mapper API. */
    private TapMapper<IN, OUT> mapper;
    /** True when the job has zero reduce tasks; output goes straight to the sink. */
    private boolean isMapOnly;
    private TapContext<OUT> context;
    /** Avro schema of the OUT type, derived in {@link #determineOutputFormat}. */
    private Schema schema;
    private String groupBy;
    private String sortBy;
    /** Input decoding mode flags; at most one of these is effective per job. */
    private boolean isTextInput = false;
    private boolean isStringInput = false;
    private boolean isJsonInput = false;
    private boolean isProtoInput = false;
    /** Avro schema of the IN type; only used for JSON input conversion. */
    private Schema inSchema;
    /** Running count of unparsable JSON records seen so far. */
    private int parseErrors = 0;
    /** Reused binary encoder for the JSON -> Avro round trip. */
    private BinaryEncoder encoder = null;
    private EncoderFactory factory = new EncoderFactory();
    // TODO: make this configurable
    private int maxAllowedErrors = 1000;
    private Pipe<OUT> outPipe = null;

    /**
     * Instantiates the user's {@link TapMapper} and inspects the job
     * configuration to work out how input records must be decoded and how
     * output records will be emitted.
     *
     * @param conf the Hadoop job configuration
     */
    @Override
    public void configure(JobConf conf) {
        this.mapper = ReflectionUtils.newInstance(
                conf.getClass(Phase.MAPPER, TapMapper.class, TapMapper.class),
                conf);
        this.isMapOnly = conf.getNumReduceTasks() == 0;
        try {
            determineInputFormat(conf);
            determineOutputFormat(conf);
            this.groupBy = conf.get(Phase.GROUP_BY);
            this.sortBy = conf.get(Phase.SORT_BY);
        } catch (Exception e) {
            // Rethrow unchanged when possible so callers see the original type.
            if (e instanceof RuntimeException)
                throw (RuntimeException) e;
            throw new RuntimeException(e);
        }
        mapper.setConf(conf);
        mapper.init(conf.get("map.input.file"));
    }

    /**
     * Hadoop entry point. Lazily builds the {@link TapContext} on the first
     * record, binds it to the output pipe, then hands the record to the user's
     * mapper via {@link #invokeMapper}.
     */
    @Override
    public void map(KEY wrapper, VALUE value,
            OutputCollector<KO, VO> collector, Reporter reporter)
            throws IOException {
        if (this.context == null) {
            KeyExtractor<BinaryKey, OUT> extractor = new ReflectionKeyExtractor<OUT>(
                    schema, groupBy, sortBy);
            this.context = new TapContext<OUT>(
                    new Collector<BinaryKey>(collector, extractor), reporter);
        }
        bindContextToPipe(collector, reporter);
        invokeMapper(wrapper, value, reporter);
    }

    /**
     * Determines how input records should be decoded by comparing the file's
     * sniffed signature against the configured InputFormat, then setting
     * exactly one of the is*Input flags.
     *
     * @param conf the Hadoop job configuration
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IOException if the file header cannot be read
     */
    @SuppressWarnings("unchecked")
    private void determineInputFormat(JobConf conf)
            throws FileNotFoundException, IOException {
        // Compare the mapper input file signature with the Hadoop-configured class.
        FileFormat ff = sniffMapInFormat(conf);
        if (!ff.isCompatible(conf.getInputFormat())) {
            throw new IllegalArgumentException("Map input format not compatible with file format.");
        }
        // Text input is further refined by the declared map-in class;
        // otherwise the input is assumed to be Avro (or protobuf, below).
        if (conf.getInputFormat() instanceof TextInputFormat) {
            Class<?> inClass = conf.getClass(Phase.MAP_IN_CLASS, Object.class,
                    Object.class);
            if (inClass == String.class) {
                isStringInput = true;
            } else if (inClass == Text.class) {
                isTextInput = true;
            } else {
                // Any other class on a text file means JSON records that must
                // be converted into that class's Avro schema.
                isJsonInput = true;
                inSchema = ReflectUtils.getSchema((IN) ReflectionUtils.newInstance(
                        inClass, conf));
            }
        }
        isProtoInput = conf.getInputFormat() instanceof TapfileInputFormat;
    }

    /**
     * Builds the output pipe and derives the Avro schema of the OUT type from
     * a prototype instance of the configured map-out class.
     *
     * @param conf the Hadoop job configuration
     * @throws Exception if the prototype OUT instance cannot be created
     */
    @SuppressWarnings("unchecked")
    private void determineOutputFormat(JobConf conf) throws Exception {
        Class<?> outClass = conf.getClass(Phase.MAP_OUT_CLASS, Object.class, Object.class);
        OUT out = (OUT) ObjectFactory.newInstance(outClass);
        outPipe = new Pipe<OUT>(out);
        schema = ReflectUtils.getSchema(out);
    }

    /**
     * Adapts mapper output to the underlying Hadoop OutputCollector. For
     * map-reduce jobs it emits (BinaryKey, AvroValue) pairs; for map-only jobs
     * it emits either (key, ProtobufWritable) for protobuf output or
     * (AvroWrapper, NullWritable) for Avro output.
     */
    @SuppressWarnings("unchecked")
    private class Collector<K> extends AvroMultiCollector<OUT> {
        private final AvroWrapper<OUT> wrapper = new AvroWrapper<OUT>(null);
        private final AvroKey<K> keyWrapper = new AvroKey<K>(null);
        private final AvroValue<OUT> valueWrapper = new AvroValue<OUT>(null);
        private final KeyExtractor<K, OUT> extractor;
        private final K key;
        private OutputCollector<KO, VO> collector;
        // Reused wrapper for map-only jobs that write to a tapproto file.
        private ProtobufWritable protobufWritable = new ProtobufWritable();

        public Collector(OutputCollector<KO, VO> collector,
                KeyExtractor<K, OUT> extractor) {
            this.collector = collector;
            this.extractor = extractor;
            key = extractor.getProtypeKey();
            keyWrapper.datum(key);
        }

        public void collect(OUT datum) throws IOException {
            if (isMapOnly) {
                // If the mapper output is a protobuf message, then KO is a
                // BinaryKey and VO is a ProtobufWritable.
                if (datum instanceof com.google.protobuf.Message) {
                    extractor.setKey(datum, key);
                    protobufWritable.setConverter(datum.getClass());
                    protobufWritable.set(datum);
                    collector.collect((KO) key, (VO) protobufWritable);
                } else {
                    wrapper.datum(datum);
                    collector.collect((KO) wrapper, (VO) NullWritable.get());
                }
            } else {
                extractor.setKey(datum, key);
                valueWrapper.datum(datum);
                collector.collect((KO) keyWrapper, (VO) valueWrapper);
            }
        }
    }

    /**
     * Decodes the raw Hadoop record according to the input mode determined in
     * {@link #determineInputFormat} and passes the result to the user mapper.
     * JSON records are converted via an Avro serialize/deserialize round trip;
     * up to {@code maxAllowedErrors} parse failures are tolerated and counted.
     *
     * @param wrapper the Hadoop key (an AvroWrapper for Avro input)
     * @param value the Hadoop value
     * @param reporter used to count JSON parse errors
     * @throws IOException if Avro conversion fails
     */
    @SuppressWarnings("unchecked")
    private void invokeMapper(KEY wrapper, VALUE value, Reporter reporter)
            throws IOException {
        if (isTextInput) {
            map((IN) value);
        } else if (isStringInput) {
            map((IN) ((Text) value).toString());
        } else if (isProtoInput) {
            map((IN) ((ProtobufWritable) value).get());
        } else if (isJsonInput) {
            String json = ((Text) value).toString();
            if (shouldSkip(json))
                return;
            // Inefficient JSON-to-Avro conversion: serialize the generic
            // record then deserialize it into the specific IN type. A more
            // efficient approach would be JsonToClass.jsonToRecord.
            try {
                GenericContainer c = JsonToGenericRecord.jsonToRecord(json,
                        inSchema);
                GenericDatumWriter<GenericContainer> writer = new GenericDatumWriter<GenericContainer>(
                        inSchema);
                ByteArrayOutputStream bos = new ByteArrayOutputStream();
                writer.setSchema(inSchema);
                encoder = factory.binaryEncoder(bos, encoder);
                writer.write(c, encoder);
                encoder.flush(); // NOTE(review): flush before reading bytes; confirm factory returns an unbuffered encoder otherwise
                byte[] data = bos.toByteArray();
                GenericDatumReader<IN> reader = new SpecificDatumReader<IN>(
                        inSchema);
                reader.setSchema(inSchema);
                IN converted = reader.read(null, DecoderFactory
                        .defaultFactory().createBinaryDecoder(data, null));
                map(converted);
            } catch (JsonParseException jpe) {
                System.err.println("Failed to parse " + json + ": "
                        + jpe.getMessage());
                reporter.incrCounter("ColHadoopMapper", "json-parse-error", 1L);
                if (++parseErrors > maxAllowedErrors) {
                    throw new RuntimeException(jpe);
                }
            }
        } else {
            // Default: Avro input — the datum rides in the key wrapper.
            map(((AvroWrapper<IN>) wrapper).datum());
        }
    }

    /** Forwards one decoded record to the user mapper along the output pipe. */
    private void map(IN value) {
        mapper.map(value, outPipe);
    }

    /**
     * Binds the output pipe to a TapContext, reusing the existing context when
     * one has already been built for this task.
     *
     * @param collector the Hadoop output collector
     * @param reporter the Hadoop progress reporter
     */
    private void bindContextToPipe(OutputCollector<KO, VO> collector,
            Reporter reporter) {
        if (this.outPipe != null && this.outPipe.getContext() == null) {
            if (this.context == null) {
                KeyExtractor<BinaryKey, OUT> extractor = new ReflectionKeyExtractor<OUT>(
                        schema, groupBy, sortBy);
                outPipe.setContext(new TapContext<OUT>(
                        new Collector<BinaryKey>(collector, extractor), reporter));
            } else {
                outPipe.setContext(this.context);
            }
        }
    }

    /**
     * Returns true for JSON lines that should be ignored: blank lines and
     * comment lines starting (after whitespace) with '#' or "//".
     */
    private boolean shouldSkip(String json) {
        int i;
        int len = json.length();
        for (i = 0; i < len; i++)
            if (!Character.isWhitespace(json.charAt(i)))
                break;
        if (i == len)
            return true; // blank line
        return json.charAt(i) == '#'
                || (json.charAt(i) == '/' && len > (i + 1)
                        && json.charAt(i + 1) == '/'); // skip comments
    }

    /**
     * Opens the map input file and reads its header to determine the file
     * format. The stream is always closed here, even on failure.
     *
     * @param conf the Hadoop job configuration (supplies "map.input.file")
     * @return the detected format, or the UNKNOWN format if none matched
     * @throws FileNotFoundException if the input file does not exist
     * @throws IOException if the header cannot be read
     */
    private FileFormat sniffMapInFormat(JobConf conf) throws IOException,
            FileNotFoundException {
        FileFormat returnFormat = Formats.UNKNOWN_FORMAT.getFileFormat();
        Path path = new Path(conf.get("map.input.file"));
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream in = null;
        try {
            in = fs.open(path);
            byte[] header = readHeader(in);
            returnFormat = determineFileFormat(header);
        } finally {
            if (in != null)
                in.close();
        }
        return returnFormat;
    }

    /**
     * Matches the header bytes against each known format's signature.
     *
     * @param header the first bytes of the input file
     * @return the first matching format, or the UNKNOWN format
     */
    private FileFormat determineFileFormat(byte[] header) {
        for (Formats format : Formats.values()) {
            if (format.getFileFormat().signature(header)) {
                return format.getFileFormat();
            }
        }
        return Formats.UNKNOWN_FORMAT.getFileFormat();
    }

    /**
     * Reads up to the first {@link #SNIFF_HEADER_SIZE} bytes from the stream.
     * Loops because InputStream.read may return fewer bytes than requested;
     * files shorter than the buffer leave the remainder zero-filled. The
     * caller retains ownership of the stream and is responsible for closing it.
     *
     * @param inputStream the open input stream positioned at the file start
     * @return a buffer of SNIFF_HEADER_SIZE bytes containing the file header
     * @throws IOException if reading fails
     */
    private byte[] readHeader(FSDataInputStream inputStream) throws IOException {
        byte[] header = new byte[SNIFF_HEADER_SIZE];
        int offset = 0;
        while (offset < header.length) {
            int n = inputStream.read(header, offset, header.length - offset);
            if (n < 0)
                break; // EOF: file shorter than the sniff window
            offset += n;
        }
        return header;
    }

    /** Flushes and closes the user mapper via the output pipe. */
    @Override
    public void close() throws IOException {
        mapper.close(outPipe);
    }
}