package edu.isi.karma.mapreduce.inputformat; import java.io.IOException; import java.lang.reflect.Field; import java.nio.charset.Charset; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class CSVBatchTextInputFormat extends FileInputFormat<Writable, Text> { private static Logger LOG = LoggerFactory.getLogger(CSVBatchTextInputFormat.class); @Override public RecordReader<Writable, Text> createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { RecordReader<Writable, Text> recordReader = new CSVBatchRecordReader(); recordReader.initialize(split, context); return recordReader; } @Override protected boolean isSplitable(JobContext context, Path filename) { return false; } static { try { Field defaultCharsetField = Charset.class.getDeclaredField("defaultCharset"); defaultCharsetField.setAccessible(true); defaultCharsetField.set(null, Charset.forName("UTF-8")); } catch (Exception e) { LOG.error("something wrong", e); } } }