package ch.unibe.scg.cc; import java.io.Closeable; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.Set; import javax.inject.Inject; import ch.unibe.scg.cc.Annotations.Type1; import ch.unibe.scg.cc.Annotations.Type2; import ch.unibe.scg.cc.Protos.CloneType; import ch.unibe.scg.cc.Protos.CodeFile; import ch.unibe.scg.cc.Protos.Function; import ch.unibe.scg.cc.Protos.Project; import ch.unibe.scg.cc.Protos.Snippet; import ch.unibe.scg.cc.Protos.Version; import ch.unibe.scg.cc.lines.StringOfLines; import ch.unibe.scg.cc.lines.StringOfLinesFactory; import ch.unibe.scg.cells.Sink; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import com.google.common.collect.Iterables; import com.google.common.io.Closer; import com.google.protobuf.ByteString; /** * Populator populates the persistent tables of all code. It is used from a project tree walk. * * <b>NOT THREADSAFE.</b> */ public class Populator implements Closeable, Serializable { final static private long serialVersionUID = 1L; private static final int CACHE_SIZE = 1000000; static final int MINIMUM_LINES = 5; static final int MINIMUM_FRAME_SIZE = MINIMUM_LINES; final private Normalizer type1; final private Normalizer type2; final private Tokenizer tokenizer; final private StandardHasher standardHasher; final private Hasher shingleHasher; final private StringOfLinesFactory stringOfLinesFactory; // If you add a cellSink - REMEMBER TO ADD IT TO CLOSE! final private Sink<Project> projectSink; final private Sink<Version> versionSink; final private Sink<CodeFile> codeFileSink; final private Sink<Function> functionSink; final private Sink<Str<Function>> functionStringSink; /** Function2Snippet */ final private Sink<Snippet> snippetSink; /** Changes for every project */ private Sink<Snippet> snippet2Functions; /** Functions that were successfully written to DB in this mapper */ private transient Cache<ByteString, Boolean> writtenFunctions = CacheBuilder.newBuilder().maximumSize(CACHE_SIZE).build(); /** Files that were successfully written to DB in this mapper */ private transient Cache<ByteString, Boolean> writtenFiles = CacheBuilder.newBuilder().maximumSize(CACHE_SIZE).build(); @Inject Populator(StandardHasher standardHasher, ShingleHasher shingleHasher, @Type1 Normalizer type1, @Type2 Normalizer type2, Tokenizer tokenizer, StringOfLinesFactory stringOfLinesFactory, @Annotations.Populator Sink<Project> projectSink, @Annotations.Populator Sink<Version> versionSink, @Annotations.Populator Sink<CodeFile> codeFileSink, @Annotations.Populator Sink<Function> functionSink, @Annotations.Populator Sink<Snippet> snippetSink, @Annotations.Populator Sink<Str<Function>> functionStringSink) { this.standardHasher = standardHasher; this.shingleHasher = shingleHasher; this.type1 = type1; this.type2 = type2; this.tokenizer = tokenizer; this.stringOfLinesFactory = stringOfLinesFactory; this.projectSink = projectSink; this.versionSink = versionSink; this.codeFileSink = codeFileSink; this.functionSink = functionSink; this.snippetSink = snippetSink; this.functionStringSink = functionStringSink; } private void readObject(java.io.ObjectInputStream stream) throws IOException, ClassNotFoundException { stream.defaultReadObject(); writtenFunctions = CacheBuilder.newBuilder().maximumSize(CACHE_SIZE).build(); writtenFiles = CacheBuilder.newBuilder().maximumSize(CACHE_SIZE).build(); } /** Register all Versions of a Project */ public class ProjectRegistrar implements AutoCloseable { final private Project.Builder project; /** Separate from project, because we're keeping builders */ final private Collection<Version.Builder> versions = new ArrayList<>(); ProjectRegistrar(String projectName) { project = Project.newBuilder().setName(projectName); } @Override public void close() throws IOException, InterruptedException { snippet2Functions = null; if (versions.isEmpty()) { return; } Set<ByteString> hs = new HashSet<>(); for (Version.Builder v : versions) { hs.add(v.getHash()); } project.setHash(xor(hs)); projectSink.write(project.build()); for (Version.Builder v : versions) { v.setProject(project.getHash()); versionSink.write(v.build()); } } /** @return a new VersionRegistrar. */ public VersionRegistrar makeVersionRegistrar(String versionName) { return new VersionRegistrar(this, versionName); } void register(Version.Builder v) { versions.add(v); } } /** Register all CodeFiles of a Version */ public class VersionRegistrar implements AutoCloseable { final private ProjectRegistrar projectRegistrar; /** Separate from version because we're storing the builders. */ final private Collection<CodeFile.Builder> files = new ArrayList<>(); final private Version.Builder version; VersionRegistrar(ProjectRegistrar projectRegistrar, String versionName) { this.projectRegistrar = projectRegistrar; version = Version.newBuilder().setName(versionName); } @Override public void close() throws IOException, InterruptedException { if (files.isEmpty()) { return; } Set<ByteString> fileHashes = new HashSet<>(); for (CodeFile.Builder fil : files) { fileHashes.add(fil.getHash()); } version.setHash(xor(fileHashes)); projectRegistrar.register(version); for (CodeFile.Builder fil : files) { fil.setVersion(version.getHash()); codeFileSink.write(fil.build()); } } /** @return a new FileRegistrar. */ public FileRegistrar makeFileRegistrar() { return new FileRegistrar(this); } void register(CodeFile.Builder fil) { files.add(fil); } } /** Register all Functions of a CodeFile */ public class FileRegistrar { final private VersionRegistrar versionRegistrar; FileRegistrar(VersionRegistrar versionRegistrar) { this.versionRegistrar = versionRegistrar; } /** Registers all functions and snippets in {@code contents}. */ public void register(String path, String contents) throws IOException, InterruptedException { CodeFile.Builder fil = CodeFile.newBuilder() .setPath(path) .setContents(contents) .setHash(ByteString.copyFrom(standardHasher.hash(contents))); versionRegistrar.register(fil); if (writtenFiles.getIfPresent(fil.getHash()) != null) { return; } for (Function fun : tokenizer.tokenize(contents)) { // type-1 StringBuilder c = new StringBuilder(fun.getContents()); type1.normalize(c); String normalized = c.toString(); if (Utils.countLines(normalized) < MINIMUM_LINES) { continue; } // TODO: Should this be part of the tokenizer? fun = Function.newBuilder(fun).setHash(ByteString.copyFrom(standardHasher.hash(fun.getContents()))) .setCodeFile(fil.getHash()).build(); functionStringSink.write(new Str<Function>(fun.getHash(), fun.getContents())); functionSink.write(fun); if (writtenFunctions.getIfPresent(fun.getHash()) != null) { return; } registerSnippets(fun, normalized, CloneType.LITERAL); // type-2 type2.normalize(c); normalized = c.toString(); registerSnippets(fun, normalized, CloneType.RENAMED); // type-3 registerSnippets(fun, normalized, CloneType.GAPPED); } } } /** * @param newSnippet2Functions * The sink for snippet2Function to be used while this * projectRegistrar is active. * @return a new ProjectRegistrar. */ public ProjectRegistrar makeProjectRegistrar(String projectName, Sink<Snippet> newSnippet2Functions) { this.snippet2Functions = newSnippet2Functions; return new ProjectRegistrar(projectName); } @Override public void close() throws IOException { try(Closer closer = Closer.create()) { closer.register(snippetSink); closer.register(functionSink); closer.register(codeFileSink); closer.register(versionSink); closer.register(projectSink); closer.register(functionStringSink); if (snippet2Functions != null) { closer.register(snippet2Functions); } } } private void registerSnippets(Protos.Function fun, String normalized, CloneType type) throws IOException, InterruptedException { StringOfLines s = stringOfLinesFactory.make(normalized, '\n'); Hasher hasher = standardHasher; if (type.equals(CloneType.GAPPED)) { hasher = shingleHasher; } for (int frameStart = 0; frameStart + MINIMUM_LINES <= s.getNumberOfLines(); frameStart++) { byte[] hash; try { hash = hasher.hash(s.getLines(frameStart, MINIMUM_LINES)); } catch (CannotBeHashedException e) { // TODO: cannotBeHashedCounter.increment(1); continue; } Snippet snip = Protos.Snippet.newBuilder() .setFunction(fun.getHash()) .setLength(MINIMUM_LINES) .setPosition(frameStart) .setHash(ByteString.copyFrom(hash)) .setCloneType(type) .build(); snippetSink.write(snip); snippet2Functions.write(snip); } } static ByteString xor(Iterable<ByteString> hashes) { assert !Iterables.isEmpty(hashes) : "You asked me to xor an empty iterable."; byte[] ret = new byte[Iterables.getFirst(hashes, null).size()]; for (ByteString h : hashes) { Utils.xor(ret, h.toByteArray()); } return ByteString.copyFrom(ret); } }