package edu.isi.karma.mapreduce.inputformat;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileAsTextInputFormat;
import org.apache.hadoop.mapred.SequenceFileAsTextRecordReader;

/**
 * An input format that reads SequenceFile keys and values as {@link Text}
 * and flattens embedded line terminators in each value, so that every
 * record can be emitted as a single line of text.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class SequenceFileAsLineInputFormat extends SequenceFileAsTextInputFormat {

    public SequenceFileAsLineInputFormat() {
        super();
    }

    @Override
    public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf job,
            Reporter reporter) throws IOException {
        reporter.setStatus(split.toString());
        return new SequenceRecorderLineReader(job, (FileSplit) split);
    }

    /**
     * A record reader that replaces newlines and carriage returns in each
     * value with spaces before handing the pair to the caller.
     */
    public static class SequenceRecorderLineReader extends SequenceFileAsTextRecordReader {

        public SequenceRecorderLineReader(Configuration conf, FileSplit split)
                throws IOException {
            super(conf, split);
        }

        /** Reads the next key/value pair, collapsing the value onto one line. */
        @Override
        public synchronized boolean next(Text key, Text value) throws IOException {
            boolean returnVal = super.next(key, value);
            if (returnVal) {
                // Strip embedded line terminators so the value occupies a
                // single line in line-oriented text output.
                String valueStr = value.toString().replace("\n", " ").replace("\r", " ");
                value.set(valueStr);
            }
            return returnVal;
        }
    }
}
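For context, a minimal driver sketch of how this input format might be wired into an old-API (mapred) job; it is not part of the original source. The class name SequenceFileAsLineDriver, the IdentityMapper map-only setup, and the command-line input/output paths are illustrative assumptions, and the class would live in its own file.

package edu.isi.karma.mapreduce.inputformat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;

// Hypothetical driver: reads SequenceFiles with SequenceFileAsLineInputFormat
// and writes each record as one line of text output.
public class SequenceFileAsLineDriver {

    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(SequenceFileAsLineDriver.class);
        job.setJobName("sequence-file-as-line");

        // Values arrive from the record reader with newlines already
        // flattened to spaces.
        job.setInputFormat(SequenceFileAsLineInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);

        // Map-only identity job: one output line per SequenceFile record.
        job.setMapperClass(IdentityMapper.class);
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input and output paths supplied on the command line (assumed).
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}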