package com.linkedin.camus.etl.kafka.mapred;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.log4j.Logger;
import org.codehaus.jackson.map.ObjectMapper;

import com.linkedin.camus.etl.RecordWriterProvider;
import com.linkedin.camus.etl.kafka.common.EtlCounts;
import com.linkedin.camus.etl.kafka.common.EtlKey;

/**
 * Output committer for the Camus ETL job. On task commit it moves the working
 * "data.*" files into their final partitioned locations, optionally writes
 * per-topic count files for tracking, and persists the latest Kafka offsets
 * as a SequenceFile of {@link EtlKey}s so the next run knows where to resume.
 */
public class EtlMultiOutputCommitter extends FileOutputCommitter {
  private Pattern workingFileMetadataPattern;

  private HashMap<String, EtlCounts> counts = new HashMap<String, EtlCounts>();
  private HashMap<String, EtlKey> offsets = new HashMap<String, EtlKey>();
  private HashMap<String, Long> eventCounts = new HashMap<String, Long>();

  private TaskAttemptContext context;
  private final RecordWriterProvider recordWriterProvider;
  private Logger log;

  /** Recursively creates the parent directories of {@code path}, then {@code path} itself. */
  private void mkdirs(FileSystem fs, Path path) throws IOException {
    if (!fs.exists(path.getParent())) {
      mkdirs(fs, path.getParent());
    }
    fs.mkdirs(path);
  }

  /** Updates the monitoring counts for the working file this key belongs to. */
  public void addCounts(EtlKey key) throws IOException {
    String workingFileName = EtlMultiOutputFormat.getWorkingFileName(context, key);
    if (!counts.containsKey(workingFileName))
      counts.put(workingFileName,
          new EtlCounts(key.getTopic(), EtlMultiOutputFormat.getMonitorTimeGranularityMs(context)));
    counts.get(workingFileName).incrementMonitorCount(key);
    addOffset(key);
  }

  /**
   * Tracks the latest offset seen for each topic/leader/partition, along with a
   * running average and total of the message sizes.
   */
  public void addOffset(EtlKey key) {
    String topicPart = key.getTopic() + "-" + key.getLeaderId() + "-" + key.getPartition();
    EtlKey offsetKey = new EtlKey(key);

    if (offsets.containsKey(topicPart)) {
      long totalSize = offsets.get(topicPart).getTotalMessageSize() + key.getMessageSize();
      long avgSize = totalSize / (eventCounts.get(topicPart) + 1);
      offsetKey.setMessageSize(avgSize);
      offsetKey.setTotalMessageSize(totalSize);
    } else {
      eventCounts.put(topicPart, 0L);
    }
    eventCounts.put(topicPart, eventCounts.get(topicPart) + 1);
    offsets.put(topicPart, offsetKey);
  }

  public EtlMultiOutputCommitter(Path outputPath, TaskAttemptContext context, Logger log) throws IOException {
    super(outputPath, context);
    this.context = context;
    try {
      // The provider class is configurable, so it is constructed reflectively;
      // it must expose a constructor taking a TaskAttemptContext.
      //recordWriterProvider = EtlMultiOutputFormat.getRecordWriterProviderClass(context).newInstance();
      Class<RecordWriterProvider> rwp = EtlMultiOutputFormat.getRecordWriterProviderClass(context);
      Constructor<RecordWriterProvider> crwp = rwp.getConstructor(TaskAttemptContext.class);
      recordWriterProvider = crwp.newInstance(context);
    } catch (Exception e) {
      throw new IllegalStateException(e);
    }
    // Working file names look like: data.<topic>.<leaderId>.<partition>.<encodedPartition>-m-<task><ext>
    workingFileMetadataPattern = Pattern.compile(
        "data\\.([^\\.]+)\\.([\\d_]+)\\.(\\d+)\\.([^\\.]+)-m-\\d+" + recordWriterProvider.getFilenameExtension());
    this.log = log;
  }

  @Override
  public void commitTask(TaskAttemptContext context) throws IOException {
    ArrayList<Map<String, Object>> allCountObject = new ArrayList<Map<String, Object>>();
    FileSystem fs = FileSystem.get(context.getConfiguration());
    if (EtlMultiOutputFormat.isRunMoveData(context)) {
      Path workPath = super.getWorkPath();
      log.info("work path: " + workPath);
      Path baseOutDir = EtlMultiOutputFormat.getDestinationPath(context);
      log.info("Destination base path: " + baseOutDir);
      for (FileStatus f : fs.listStatus(workPath)) {
        String file = f.getPath().getName();
        log.info("work file: " + file);
        if (file.startsWith("data")) {
          String workingFileName = file.substring(0, file.lastIndexOf("-m"));
          EtlCounts count = counts.get(workingFileName);
          count.setEndTime(System.currentTimeMillis());

          String partitionedFile =
              getPartitionedPath(context, file, count.getEventCount(), count.getLastKey().getOffset());
          Path dest = new Path(baseOutDir, partitionedFile);

          if (!fs.exists(dest.getParent())) {
            mkdirs(fs, dest.getParent());
          }

          commitFile(context, f.getPath(), dest);
          log.info("Moved file from: " + f.getPath() + " to: " + dest);

          if (EtlMultiOutputFormat.isRunTrackingPost(context)) {
            count.writeCountsToMap(allCountObject, fs, new Path(workPath,
                EtlMultiOutputFormat.COUNTS_PREFIX + "."
                    + dest.getName().replace(recordWriterProvider.getFilenameExtension(), "")));
          }
        }
      }

      if (EtlMultiOutputFormat.isRunTrackingPost(context)) {
        Path tempPath = new Path(workPath, "counts." + context.getConfiguration().get("mapred.task.id"));
        OutputStream outputStream = new BufferedOutputStream(fs.create(tempPath));
        ObjectMapper mapper = new ObjectMapper();
        log.info("Writing counts to : " + tempPath.toString());
        long time = System.currentTimeMillis();
        mapper.writeValue(outputStream, allCountObject);
        log.debug("Time taken : " + (System.currentTimeMillis() - time) / 1000); // seconds
      }
    } else {
      log.info("Not moving run data.");
    }

    // Persist the final offsets for this task so the next run can resume from them.
    SequenceFile.Writer offsetWriter = SequenceFile.createWriter(fs, context.getConfiguration(),
        new Path(super.getWorkPath(),
            EtlMultiOutputFormat.getUniqueFile(context, EtlMultiOutputFormat.OFFSET_PREFIX, "")),
        EtlKey.class, NullWritable.class);
    for (String s : offsets.keySet()) {
      log.info("Avg record size for " + offsets.get(s).getTopic() + ":" + offsets.get(s).getPartition() + " = "
          + offsets.get(s).getMessageSize());
      offsetWriter.append(offsets.get(s), NullWritable.get());
    }
    offsetWriter.close();

    super.commitTask(context);
  }

  /** Renames {@code source} to {@code target}, failing the task if the rename fails. */
  protected void commitFile(JobContext job, Path source, Path target) throws IOException {
    log.info(String.format("Moving %s to %s", source, target));
    if (!FileSystem.get(job.getConfiguration()).rename(source, target)) {
      log.error(String.format("Failed to move from %s to %s", source, target));
      throw new IOException(String.format("Failed to move from %s to %s", source, target));
    }
  }

  /**
   * Derives the final partitioned path for a working file by parsing the topic,
   * leader id, partition, and encoded partition out of its name and delegating
   * to the topic's configured partitioner.
   */
  public String getPartitionedPath(JobContext context, String file, int count, long offset) throws IOException {
    Matcher m = workingFileMetadataPattern.matcher(file);
    if (!m.find()) {
      throw new IOException("Could not extract metadata from working filename '" + file + "'");
    }
    String topic = m.group(1);
    String leaderId = m.group(2);
    String partition = m.group(3);
    String encodedPartition = m.group(4);

    String partitionedPath =
        EtlMultiOutputFormat.getPartitioner(context, topic).generatePartitionedPath(context, topic, encodedPartition);
    partitionedPath += "/" + EtlMultiOutputFormat.getPartitioner(context, topic).generateFileName(context, topic,
        leaderId, Integer.parseInt(partition), count, offset, encodedPartition);

    return partitionedPath + recordWriterProvider.getFilenameExtension();
  }
}
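
// Rough usage sketch (hypothetical wiring; in a real Camus job this committer is
// normally created by EtlMultiOutputFormat and driven by the MapReduce framework,
// not instantiated by hand):
//
//   TaskAttemptContext context = ...; // provided by the framework
//   Path outputPath = org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.getOutputPath(context);
//   Logger log = Logger.getLogger(EtlMultiOutputCommitter.class);
//   EtlMultiOutputCommitter committer = new EtlMultiOutputCommitter(outputPath, context, log);
//
//   // As each record is written, the record writer reports its key, which also
//   // tracks offsets via addOffset(key):
//   committer.addCounts(etlKey);
//
//   // At the end of the task the framework calls commitTask(context), which moves
//   // the data files, writes count files, and persists offsets for the next run.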