package hip.ch3.csv;

import au.com.bytecode.opencsv.CSVParser;
import hip.ch3.TextArrayWritable;
import hip.util.HadoopCompat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

import java.io.IOException;

/**
 * An {@link org.apache.hadoop.mapreduce.InputFormat} for CSV plain-text
 * files. Keys are the byte offsets of each line in the file, and values
 * are {@link TextArrayWritable} instances holding the tokenized CSV
 * fields of that line.
 */
public class CSVInputFormat
    extends FileInputFormat<LongWritable, TextArrayWritable> {

  /** Job configuration key for an optional single-character field separator. */
  public static final String CSV_TOKEN_SEPARATOR_CONFIG =
      "csvinputformat.token.delimiter";

  @Override
  public RecordReader<LongWritable, TextArrayWritable>
      createRecordReader(InputSplit split, TaskAttemptContext context) {
    String csvDelimiter = HadoopCompat.getConfiguration(context).get( //<co id="ch02_comment_csv_inputformat1"/>
        CSV_TOKEN_SEPARATOR_CONFIG);

    // A null separator tells the record reader to fall back to the
    // parser's default delimiter (a comma).
    Character separator = null;
    if (csvDelimiter != null && csvDelimiter.length() == 1) {
      separator = csvDelimiter.charAt(0);
    }

    return new CSVRecordReader(separator); //<co id="ch02_comment_csv_inputformat2"/>
  }

  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    // Compressed files can't be split at arbitrary byte boundaries,
    // so only uncompressed input is splittable.
    CompressionCodec codec =
        new CompressionCodecFactory(HadoopCompat.getConfiguration(context))
            .getCodec(file);
    return codec == null; //<co id="ch02_comment_csv_inputformat3"/>
  }

  public static class CSVRecordReader //<co id="ch02_comment_csv_inputformat4"/>
      extends RecordReader<LongWritable, TextArrayWritable> {
    private LineRecordReader reader;
    private TextArrayWritable value;
    private final CSVParser parser;

    public CSVRecordReader(Character csvDelimiter) {
      this.reader = new LineRecordReader();
      if (csvDelimiter == null) {
        parser = new CSVParser(); //<co id="ch02_comment_csv_inputformat5"/>
      } else {
        parser = new CSVParser(csvDelimiter);
      }
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
      reader.initialize(split, context); //<co id="ch02_comment_csv_inputformat6"/>
    }

    @Override
    public boolean nextKeyValue()
        throws IOException, InterruptedException {
      // Delegate line reading to LineRecordReader; tokenize each line
      // that it produces.
      if (reader.nextKeyValue()) { //<co id="ch02_comment_csv_inputformat7"/>
        loadCSV(); //<co id="ch02_comment_csv_inputformat8"/>
        return true;
      } else {
        value = null;
        return false;
      }
    }

    private void loadCSV() throws IOException { //<co id="ch02_comment_csv_inputformat9"/>
      String line = reader.getCurrentValue().toString();
      String[] tokens = parser.parseLine(line); //<co id="ch02_comment_csv_inputformat10"/>
      value = new TextArrayWritable(convert(tokens));
    }

    private Text[] convert(String[] s) {
      Text[] t = new Text[s.length];
      for (int i = 0; i < t.length; i++) {
        t[i] = new Text(s[i]);
      }
      return t;
    }

    @Override
    public LongWritable getCurrentKey() //<co id="ch02_comment_csv_inputformat11"/>
        throws IOException, InterruptedException {
      return reader.getCurrentKey();
    }

    @Override
    public TextArrayWritable getCurrentValue() //<co id="ch02_comment_csv_inputformat12"/>
        throws IOException, InterruptedException {
      return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
      return reader.getProgress();
    }

    @Override
    public void close() throws IOException {
      reader.close();
    }
  }
}
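
// ---------------------------------------------------------------------------
// A minimal sketch of how a driver might wire CSVInputFormat into a map-only
// job. Only CSVInputFormat and CSV_TOKEN_SEPARATOR_CONFIG come from the
// listing above; the driver and mapper class names, the pipe delimiter, the
// input/output arguments, and the use of Hadoop 2's Job.getInstance are
// illustrative assumptions, not part of the original listing.
// ---------------------------------------------------------------------------
class CSVJobDriverSketch {
  public static void main(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf =
        new org.apache.hadoop.conf.Configuration();
    // Use a pipe as the field separator instead of the parser's
    // default comma.
    conf.set(CSVInputFormat.CSV_TOKEN_SEPARATOR_CONFIG, "|");

    org.apache.hadoop.mapreduce.Job job =
        org.apache.hadoop.mapreduce.Job.getInstance(conf, "csv-field-count");
    job.setJarByClass(CSVJobDriverSketch.class);
    job.setInputFormatClass(CSVInputFormat.class);
    job.setMapperClass(FieldCountMapper.class);
    job.setNumReduceTasks(0); // map-only job for the sketch
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(org.apache.hadoop.io.IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat
        .setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

// A hypothetical mapper consuming the (byte offset, tokenized fields) pairs
// that CSVRecordReader emits; it simply counts the fields on each line.
// This assumes TextArrayWritable extends ArrayWritable, as its use in the
// listing above suggests, so get() returns the array of field values.
class FieldCountMapper extends org.apache.hadoop.mapreduce.Mapper<
    LongWritable, TextArrayWritable, LongWritable,
    org.apache.hadoop.io.IntWritable> {
  @Override
  protected void map(LongWritable offset, TextArrayWritable fields,
                     Context ctx) throws IOException, InterruptedException {
    ctx.write(offset,
        new org.apache.hadoop.io.IntWritable(fields.get().length));
  }
}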