package hip.ch3.csv;

import hip.ch3.TextArrayWritable;
import hip.util.HadoopCompat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.DataOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * An output format that writes each {@link TextArrayWritable} key as one CSV
 * line; the {@link NullWritable} value is ignored. Fields are joined with a
 * configurable separator (default {@code ","}) and, per RFC 4180, a field is
 * enclosed in double quotes when it contains the separator, a double quote,
 * or a newline — embedded quotes are escaped by doubling. Output is optionally
 * compressed with the job's configured codec (gzip by default).
 */
public class CSVOutputFormat extends TextOutputFormat<TextArrayWritable, NullWritable> {

  /** Configuration key for the CSV field separator; defaults to {@code ","}. */
  public static final String CSV_TOKEN_SEPARATOR_CONFIG = "csvoutputformat.token.delimiter";

  /**
   * Creates the record writer for this task attempt, honoring the job's
   * compression settings and the configured field separator.
   *
   * @param job the task attempt context supplying the configuration
   * @return a writer emitting one CSV line per record
   * @throws IOException if the output file cannot be created
   * @throws InterruptedException declared by the RecordWriter contract
   */
  @Override
  public RecordWriter<TextArrayWritable, NullWritable> getRecordWriter(TaskAttemptContext job)
      throws IOException, InterruptedException {
    Configuration conf = HadoopCompat.getConfiguration(job);
    String separator = conf.get(CSV_TOKEN_SEPARATOR_CONFIG, ",");

    boolean isCompressed = getCompressOutput(job);
    CompressionCodec codec = null;
    String extension = "";
    if (isCompressed) {
      Class<? extends CompressionCodec> codecClass =
          getOutputCompressorClass(job, GzipCodec.class);
      codec = ReflectionUtils.newInstance(codecClass, conf);
      extension = codec.getDefaultExtension();
    }

    Path file = getDefaultWorkFile(job, extension);
    FileSystem fs = file.getFileSystem(conf);
    // Create the file once; wrap in the codec stream only when compressing.
    FSDataOutputStream fileOut = fs.create(file, false);
    DataOutputStream out = isCompressed
        ? new DataOutputStream(codec.createOutputStream(fileOut))
        : fileOut;
    return new CSVRecordWriter(out, separator);
  }

  /**
   * Writes {@link TextArrayWritable} keys as UTF-8 CSV lines. Not thread-safe
   * for concurrent {@code write} calls (Hadoop invokes it from a single task
   * thread).
   */
  protected static class CSVRecordWriter extends RecordWriter<TextArrayWritable, NullWritable> {

    private static final byte[] NEWLINE = "\n".getBytes(StandardCharsets.UTF_8);
    private static final byte[] QUOTE = "\"".getBytes(StandardCharsets.UTF_8);

    protected DataOutputStream out;
    private final String csvSeparator;
    // Cached UTF-8 bytes of the separator, to avoid re-encoding on every field.
    private final byte[] separatorBytes;

    public CSVRecordWriter(DataOutputStream out, String csvSeparator) {
      this.out = out;
      this.csvSeparator = csvSeparator;
      this.separatorBytes = csvSeparator.getBytes(StandardCharsets.UTF_8);
    }

    /**
     * Writes one CSV line for {@code key}; a {@code null} key writes nothing.
     *
     * @param key the fields of the line, in order
     * @param value ignored
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(TextArrayWritable key, NullWritable value)
        throws IOException, InterruptedException {
      if (key == null) {
        return;
      }
      boolean first = true;
      for (Writable field : key.get()) {
        writeField(first, field);
        first = false;
      }
      out.write(NEWLINE);
    }

    /**
     * Writes a single field, preceded by the separator unless it is the first
     * field of the line. Fields containing the separator, a double quote, or a
     * newline are quoted, with embedded quotes doubled (RFC 4180). Unquoted
     * {@link Text} fields take a fast path that copies their UTF-8 bytes
     * without re-encoding.
     *
     * @param first whether this is the first field on the line
     * @param o the field to write
     * @throws IOException if the underlying stream fails
     */
    private void writeField(boolean first, Writable o) throws IOException {
      if (!first) {
        out.write(separatorBytes);
      }
      String s = o.toString();
      boolean needsQuoting = s.contains(csvSeparator)
          || s.indexOf('"') >= 0
          || s.indexOf('\n') >= 0
          || s.indexOf('\r') >= 0;
      if (!needsQuoting && o instanceof Text) {
        // Fast path: Text already holds UTF-8 bytes.
        Text t = (Text) o;
        out.write(t.getBytes(), 0, t.getLength());
        return;
      }
      if (needsQuoting) {
        out.write(QUOTE);
        out.write(s.replace("\"", "\"\"").getBytes(StandardCharsets.UTF_8));
        out.write(QUOTE);
      } else {
        out.write(s.getBytes(StandardCharsets.UTF_8));
      }
    }

    @Override
    public synchronized void close(TaskAttemptContext context) throws IOException {
      out.close();
    }
  }
}