package ch.unibe.scg.cc; import java.io.IOException; import java.io.Serializable; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.eclipse.jgit.lib.Constants; import ch.unibe.scg.cc.Protos.GitRepo; import ch.unibe.scg.cells.Cell; import ch.unibe.scg.cells.Codec; import com.google.common.io.ByteStreams; import com.google.protobuf.ByteString; /** * File input format for bare git repositories. The input path must be the a * packfile in a git repo. The only two files that are looked at is the packfile * and the pack refs. */ class GitInputFormat extends FileInputFormat<ImmutableBytesWritable, ImmutableBytesWritable> implements Serializable { final private static long serialVersionUID = 1L; /** Codec that reads the input format into a GitRepo compatible with GitInputFormat. */ static final class GitRepoCodec implements Codec<GitRepo> { final private static long serialVersionUID = 1L; final private static byte[] COL_KEY = ByteString.copyFromUtf8("project").toByteArray(); @Override public Cell<GitRepo> encode(GitRepo r) { return Cell.make(ByteString.copyFromUtf8(r.getProjectName()), ByteString.copyFrom(COL_KEY), r.toByteString()); } @Override public GitRepo decode(Cell<GitRepo> encoded) throws IOException { return GitRepo.parseFrom(encoded.getCellContents()); } } private static class GitPathRecordReader extends RecordReader<ImmutableBytesWritable, ImmutableBytesWritable> { private static final long MAX_PACKFILE_SIZE_MB = 500; private ImmutableBytesWritable currentKey; private ImmutableBytesWritable currentValue; private boolean isFinished; private FileSplit split; private FileSystem fs; @Override public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { split = (FileSplit) inputSplit; // Cast will always hold for FileInputFormats. // Taken from SequenceFileRecordReader#initialize. fs = split.getPath().getFileSystem(taskAttemptContext.getConfiguration()); // Again, this line is stolen from SequenceFileRecordReader. } @Override public boolean nextKeyValue() throws IOException, InterruptedException { if (isFinished) { return false; } // TODO: Delete this crutch. // It's here because there's an irrelevant file in the input … if (split.getPath().getName().equals("index")) { return false; } // Find packFilePath. FileStatus[] potentialPackFiles = fs.listStatus(new Path(split.getPath(), "objects/pack/")); Path packFilePath = null; for (FileStatus f : potentialPackFiles) { if (f.getPath().getName().endsWith(".pack")) { // Found pack file. if (packFilePath != null) { throw new IOException("We accept only fully packed git repos as " + "input. But there were two pack files."); } packFilePath = f.getPath(); } } if (packFilePath == null) { throw new IOException("There was no pack file in the repo!"); } // Check pack file size because with protobuf 2.4.0a the pack file gets copied 3 times in memory. if (fs.getFileStatus(packFilePath).getLen() / 1048576L > MAX_PACKFILE_SIZE_MB) { throw new IOException("Pack file exceeded size limit of " + MAX_PACKFILE_SIZE_MB + " MB."); } byte[] packFile; try (FSDataInputStream fsin = fs.open(packFilePath)) { packFile = ByteStreams.toByteArray(fsin); } Path packRefsPath = new Path(split.getPath(), Constants.PACKED_REFS); byte[] packRefs; try (FSDataInputStream fsin = fs.open(packRefsPath)) { packRefs = ByteStreams.toByteArray(fsin); } Cell<GitRepo> cell = new GitRepoCodec().encode( GitRepo.newBuilder() .setProjectName(packFilePath.toString()) .setPackFile(ByteString.copyFrom(packFile)) .setPackRefs(ByteString.copyFrom(packRefs)).build()); currentKey = new ImmutableBytesWritable(cell.getRowKey().toByteArray()); currentValue = new ImmutableBytesWritable(cell.getCellContents().toByteArray()); isFinished = true; return true; } @Override public float getProgress() throws IOException, InterruptedException { if (isFinished) { return 1.0f; } return 0.0f; } @Override public ImmutableBytesWritable getCurrentKey() throws IOException, InterruptedException { return currentKey; } @Override public ImmutableBytesWritable getCurrentValue() throws IOException, InterruptedException { return currentValue; } @Override public void close() throws IOException { if (fs != null) { fs.close(); fs = null; } } } @Override protected boolean isSplitable(JobContext context, Path filename) { return false; } @Override public RecordReader<ImmutableBytesWritable, ImmutableBytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { return new GitPathRecordReader(); } }