package ch.unibe.scg.cc;

import static com.google.common.base.Preconditions.checkArgument;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.inject.Inject;

import org.eclipse.jgit.errors.MissingObjectException;
import org.eclipse.jgit.lib.ObjectId;
import org.eclipse.jgit.revwalk.RevWalk;
import org.eclipse.jgit.storage.file.FileRepository;
import org.eclipse.jgit.transport.PackParser;
import org.eclipse.jgit.treewalk.TreeWalk;
import org.eclipse.jgit.treewalk.filter.PathSuffixFilter;

import ch.unibe.scg.cc.Annotations.MapsKilledDueToTimeout;
import ch.unibe.scg.cc.Annotations.MissingObjectExceptions;
import ch.unibe.scg.cc.Annotations.ProcessedFiles;
import ch.unibe.scg.cc.Populator.ProjectRegistrar;
import ch.unibe.scg.cc.Populator.VersionRegistrar;
import ch.unibe.scg.cc.Protos.GitRepo;
import ch.unibe.scg.cc.Protos.Snippet;
import ch.unibe.scg.cells.Counter;
import ch.unibe.scg.cells.Mapper;
import ch.unibe.scg.cells.OneShotIterable;
import ch.unibe.scg.cells.Sink;

import com.google.common.base.Throwables;
import com.google.common.collect.Iterables;
import com.google.common.util.concurrent.SimpleTimeLimiter;

/**
 * GitPopulator walks Git repositories and hands their files to the {@link Populator}.
 *
 * <p>For every packed ref (tag or branch head) of a repository, all files ending in
 * {@code .java} are read from the ref's commit tree and registered with the populator.
 */
public class GitPopulator implements Mapper<GitRepo, Snippet> {
    private static final long serialVersionUID = 1L;

    /** Matches paths inside a non-bare repository; group 1 is the directory containing {@code .git}. */
    private static final Pattern projectNameRegexNonBare = Pattern.compile(".+?/([^/]+)/\\.git/.*");
    /** Matches paths inside a bare repository; group 1 is the directory containing {@code objects}. */
    private static final Pattern projectNameRegexBare = Pattern.compile(".+?/([^/]+)/objects/.*");
    private static final Logger logger = Logger.getLogger(GitPopulator.class.getName());
    /** Upper bound on the time a single call to {@link #map} may spend on one repository. */
    private static final long THREAD_TIMEOUT_IN_MINUTES = 15L;

    private final CharsetDetector charsetDetector;
    private final Populator populator;
    private final Counter killedMapCounter;
    private final Counter processedFilesCounter;
    private final Counter missingObjectCounter;

    @Inject
    GitPopulator(CharsetDetector charsetDetector, Populator populator,
            @MapsKilledDueToTimeout Counter killedMapCounter,
            @ProcessedFiles Counter processedFilesCounter,
            @MissingObjectExceptions Counter missingObjectCounter) {
        this.charsetDetector = charsetDetector;
        this.populator = populator;
        this.killedMapCounter = killedMapCounter;
        this.processedFilesCounter = processedFilesCounter;
        this.missingObjectCounter = missingObjectCounter;
    }

    /** Extracts tag and branch-head entries from the contents of a packed-refs file. */
    static class PackedRefParser {
        /** Group 1 is the 40 digit hex object id, group 2 the ref name below refs/tags or refs/heads. */
        final static Pattern pattern = Pattern.compile("([a-f0-9]{40}) refs\\/(?:tags|heads)\\/(.+)");

        /**
         * Reads {@code ins} to its end, decodes it as UTF-8 and parses the refs it lists.
         * The stream is not closed; that is left to the caller.
         *
         * @param ins the packed-refs contents
         * @return the tags and heads found, in file order; lines that do not match are skipped
         * @throws IOException if reading from {@code ins} fails
         */
        public List<PackedRef> parse(InputStream ins) throws IOException {
            // Decode explicitly as UTF-8. Casting single bytes to chars would turn the byte
            // 0x85 of a multi-byte sequence into U+0085, which Scanner treats as a line
            // terminator, cutting the ref name short.
            // The reader is deliberately not closed, as that would close the caller's stream.
            Reader reader = new InputStreamReader(ins, StandardCharsets.UTF_8);
            StringBuilder content = new StringBuilder();
            char[] buffer = new char[8192];
            int read;
            while ((read = reader.read(buffer)) != -1) {
                content.append(buffer, 0, read);
            }
            return parse(content.toString());
        }

        /** Parses {@code content} line by line, keeping only the lines matching {@link #pattern}. */
        private List<PackedRef> parse(String content) {
            List<PackedRef> refs = new ArrayList<>();
            try (Scanner lines = new Scanner(content)) {
                while (lines.hasNextLine()) {
                    Matcher m = pattern.matcher(lines.nextLine());
                    if (!m.matches()) {
                        continue;
                    }
                    String sha = m.group(1);
                    assert sha.length() == 40;
                    refs.add(new PackedRef(ObjectId.fromString(sha), m.group(2)));
                }
            }
            return refs;
        }
    }

    /** A single packed-refs entry: the object id a ref points to, and the ref's short name. */
    static class PackedRef {
        final ObjectId key;
        final String name;

        PackedRef(ObjectId key, String name) {
            this.key = key;
            this.name = name;
        }

        public ObjectId getKey() {
            return key;
        }

        public String getName() {
            return name;
        }
    }

    /**
     * Processes the Git repository and hands the files to the {@link Populator}.
     *
     * <p>The pack file of {@code repo} is unpacked into a temporary bare repository, which is
     * deleted again afterwards. The work runs under a time limit of
     * {@link #THREAD_TIMEOUT_IN_MINUTES} minutes; the working thread is interrupted when the
     * limit is exceeded.
     *
     * @param repo the repository to process
     * @param row must contain exactly one element
     * @param sink receives the output of the populator
     * @throws IllegalArgumentException if {@code row} does not contain exactly one element
     * @throws IOException if reading the repository or writing to the populator fails
     * @throws InterruptedException if the processing is interrupted
     */
    @Override
    public void map(final GitRepo repo, final OneShotIterable<GitRepo> row, final Sink<Snippet> sink)
            throws IOException, InterruptedException {
        checkArgument(Iterables.size(row) == 1);

        SimpleTimeLimiter stl = new SimpleTimeLimiter();
        try {
            stl.callWithTimeout(new Callable<Void>() {
                @Override
                public Void call() throws IOException, InterruptedException {
                    List<PackedRef> tags;
                    try (InputStream refsIn = repo.getPackRefs().newInput()) {
                        tags = new PackedRefParser().parse(refsIn);
                    }

                    long processedFiles = 0L;
                    Path unpackDir = null;
                    FileRepository unpacked = null;
                    try (ProjectRegistrar projectRegistrar
                            = populator.makeProjectRegistrar(repo.getProjectName(), sink)) {
                        unpackDir = Files.createTempDirectory(null);
                        unpacked = new FileRepository(unpackDir.toFile());
                        unpacked.create(true);

                        try (InputStream packIn = repo.getPackFile().newInput()) {
                            PackParser pp = unpacked.newObjectInserter().newPackParser(packIn);
                            // ProgressMonitor set to null, so NullProgressMonitor will be used.
                            pp.parse(null);
                        }

                        for (PackedRef paref : tags) {
                            logger.info("WALK TAG: " + paref.getName());

                            try (VersionRegistrar vr = projectRegistrar.makeVersionRegistrar(paref.getName())) {
                                TreeWalk treeWalk = new TreeWalk(unpacked);
                                treeWalk.addTree(new RevWalk(unpacked).parseCommit(paref.getKey()).getTree());
                                treeWalk.setRecursive(true);
                                treeWalk.setFilter(PathSuffixFilter.create(".java"));

                                while (treeWalk.next()) {
                                    // There's only one tree; it has index 0.
                                    ObjectId objectId = treeWalk.getObjectId(0);
                                    byte[] fileContents = treeWalk.getObjectReader().open(objectId).getBytes();
                                    vr.makeFileRegistrar().register(treeWalk.getPathString(),
                                            new String(fileContents, charsetDetector.charsetOf(fileContents)));

                                    // We can't just increment the Hadoop counter directly here because otherwise
                                    // processed files of timeouted map tasks would get counted too.
                                    processedFiles++;

                                    if (Thread.currentThread().isInterrupted()) {
                                        // We just return here because counters only count with successful map tasks.
                                        killedMapCounter.increment(1L);
                                        return null;
                                    }
                                }
                            } catch (MissingObjectException moe) {
                                // Deliberately best effort: count the broken ref and go on with the next one.
                                missingObjectCounter.increment(1L);
                            }
                        }
                        processedFilesCounter.increment(processedFiles);
                    } finally {
                        try {
                            if (unpacked != null) {
                                // Close the repository before its directory gets deleted,
                                // so that it does not hold on to files in there.
                                unpacked.close();
                            }
                        } finally {
                            if (unpackDir != null) {
                                try {
                                    removeRecursive(unpackDir);
                                } catch (IOException e) {
                                    logger.warning("Failed to delete " + unpackDir + " because " + e);
                                }
                            }
                        }
                    }
                    logger.info("Finished processing: " + repo.getProjectName());
                    return null;
                }
            }, THREAD_TIMEOUT_IN_MINUTES, TimeUnit.MINUTES, true);
        } catch (Exception e) {
            Throwables.propagateIfPossible(e, IOException.class, InterruptedException.class);
            // Keep the cause, so that the unexpected exception can be diagnosed.
            throw new RuntimeException(
                    "This shouldn't happen. Exception is neither IOException or InterruptedException.", e);
        }
    }

    /**
     * Deletes {@code path} and everything below it.
     *
     * <p>Taken from http://stackoverflow.com/questions/779519/delete-files-recursively-in-java/8685959#8685959
     *
     * @throws IOException if a file or directory could not be deleted or visited
     */
    private static void removeRecursive(Path path) throws IOException {
        Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult visitFileFailed(Path file, IOException e) throws IOException {
                // try to delete the file anyway, even if its attributes
                // could not be read, since delete-only access is
                // theoretically possible
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException e) throws IOException {
                if (e != null) {
                    // The directory could not be iterated completely; do not try to delete it.
                    throw e;
                }
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });
    }

    /**
     * Heuristically extracts the name of the Project from the path to a git repo.
     *
     * @param packFilePath
     *            full path of a pack file, possibly prefixed with a protocol.<br>
     *            Example: har://bender.unibe.ch/ant/.git/objects/pack/pack-389c04f6e54ffd737e8b4a7448d5a4d3374a7c29.pack
     * @return the name of the directory containing {@code .git} (non-bare) or {@code objects}
     *         (bare); {@code packFilePath} itself if neither layout is recognized
     */
    public String extractProjectName(String packFilePath) {
        Matcher nonBare = projectNameRegexNonBare.matcher(packFilePath);
        if (nonBare.matches()) {
            return nonBare.group(1);
        }

        Matcher bare = projectNameRegexBare.matcher(packFilePath);
        if (bare.matches()) {
            return bare.group(1);
        }

        logger.warning("Could not simplify project name " + packFilePath);
        // Use URI as project name.
        return packFilePath;
    }

    /** Closes the underlying {@link Populator}, if there is one. */
    @Override
    public void close() throws IOException {
        if (populator != null) {
            populator.close();
        }
    }
}