package com.cadrlife.devsearch.agent.indexing;
import com.cadrlife.devsearch.agent.service.analysis.FileProcessor;
import com.cadrlife.devsearch.domain.IdUtil;
import com.cadrlife.devsearch.domain.Project;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.io.DirectoryWalker;
import org.apache.commons.lang.StringUtils;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Client;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
/**
 * Walks a project's checkout directory and hands one index request per visited file to a
 * {@link BulkProcessor}.
 *
 * <p>Directories whose name ends with one of {@link #DIRECTORIES_TO_SKIP} are not descended into,
 * and files whose name contains {@code ".#"} are ignored. Every other file produces a document;
 * files that are of a forbidden type, too large, or unreadable get a placeholder {@code content}
 * value instead of their text.
 */
public class ProjectWalker extends DirectoryWalker<Void> {
    private static final String[] FILE_PREFIXES_TO_SKIP = new String[]{".#"};
    private static final String[] DIRECTORIES_TO_SKIP = new String[]{".git",".bin",".settings", ".idea","CVS",".gradle"};
    private static final String[] FORBIDDEN_FILE_TYPES = new String[]{".jpi",".gif",".png",".jar",".war",".ear",".class",".jpg",".bmp",".exe",".dmg",".zip",".gz",".tar",".bz2",".ico",".psd",".swf",".vsd",".ppt",".min.js",".min.css",".dll"};
    private static final Logger LOG = LoggerFactory.getLogger(ProjectWalker.class);
    /** Files larger than this many bytes (300 * 1024) are recorded but their content is not indexed. */
    private static final long MAX_FILE_SIZE_BYTES = 300 * 1024; // 300K

    private final Path rootPath;
    private final Project project;
    private final String codeIndex;
    private final Client esClient;
    private final BulkProcessor bulkProcessor;
    private final FileProcessor fileProcessor;

    /**
     * @param rootPath      base path that visited files are relativized against
     * @param project       project being indexed; supplies repo, name, checkout path and last-indexed value
     * @param codeIndex     name of the index the documents are written to
     * @param esClient      client used to build the index requests
     * @param bulkProcessor receives every index request that is built
     * @param fileProcessor adds source-analysis fields to a document from the file's path and content
     */
    public ProjectWalker(Path rootPath, Project project, String codeIndex, Client esClient, BulkProcessor bulkProcessor, FileProcessor fileProcessor) {
        this.rootPath = rootPath;
        this.project = project;
        this.codeIndex = codeIndex;
        this.esClient = esClient;
        this.bulkProcessor = bulkProcessor;
        this.fileProcessor = fileProcessor;
    }

    /**
     * Walks the project's checkout path and submits an index request for each accepted file.
     *
     * @throws NullPointerException if the project's last-indexed value has not been set
     * @throws IOException          if the directory walk fails
     */
    public void walkProject() throws IOException {
        // checkNotNull takes (reference, message). The original call had the arguments swapped,
        // so it validated the message literal and could never fail.
        Preconditions.checkNotNull(project.getLastIndexed(), "Last indexed date must be updated before walk");
        walk(project.getCheckoutPath().toFile(), new ArrayList<Void>());
    }

    /**
     * Decides whether the walk descends into {@code directory}.
     *
     * @return {@code false} when the directory name ends with one of {@link #DIRECTORIES_TO_SKIP}
     */
    @Override
    protected boolean handleDirectory(File directory, int depth, Collection<Void> results) throws IOException {
        return !endsWithAny(directory.toPath(), DIRECTORIES_TO_SKIP);
    }

    /**
     * Indexes {@code file} unless its name marks it as one to skip.
     */
    @Override
    protected void handleFile(File file, int depth, Collection<Void> results) throws IOException {
        String fileName = file.getName();
        // Skip names that start with a configured prefix or contain ".#" anywhere.
        if (StringUtils.startsWithAny(fileName, FILE_PREFIXES_TO_SKIP) || fileName.contains(".#")) {
            return;
        }
        indexFile(file.toPath());
    }

    /** Builds the index request for {@code aFile} and hands it to the bulk processor. */
    private void indexFile(Path aFile) {
        // Guarded so the relativize call is only made when the message will actually be logged.
        if (LOG.isDebugEnabled()) {
            LOG.debug("Index file {} to {} relativized as {}", aFile, codeIndex, rootPath.relativize(aFile));
        }
        bulkProcessor.add(buildFileIndexRequest(aFile));
    }

    /**
     * Tests whether the last name element of {@code path} ends with any of the given suffixes.
     *
     * <p>This is a plain suffix match, so a suffix such as {@code ".git"} also matches a name
     * like {@code "foo.git"}.
     *
     * @return {@code true} on the first matching suffix; {@code false} if none match or the path
     *         has no name elements (for example a filesystem root)
     */
    private boolean endsWithAny(Path path, String...extensions) {
        Path fileName = path.getFileName();
        if (fileName == null) {
            // A path with zero name elements previously triggered an IllegalArgumentException.
            return false;
        }
        String lastName = fileName.toString();
        for (String ext : extensions) {
            if (lastName.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the index request describing {@code aFile}.
     *
     * <p>The document always carries repo, project, filePath, baseFilename, extension and
     * lastIndexed. The {@code content} field is the file text decoded as UTF-8, or a
     * "Not Indexed - ..." placeholder when the path is a directory, has a forbidden file type,
     * exceeds {@link #MAX_FILE_SIZE_BYTES}, or cannot be read. All of those cases except the
     * forbidden file type also set {@code crawlError}.
     */
    private IndexRequest buildFileIndexRequest(Path aFile) {
        Path relative = rootPath.relativize(aFile);
        // Drops the first two name elements of the relative path; subpath throws if there are
        // not more than two. NOTE(review): presumably those elements are the repo and project
        // directories - confirm against the caller.
        Path filePath = relative.subpath(2, relative.getNameCount());
        String filePathString = renderPath(filePath);
        String id = IdUtil.docId(project.getRepo(), project.getName(), filePathString);
        String filename = filePath.getFileName().toString();
        String extension = com.google.common.io.Files.getFileExtension(filename);
        String baseFilename = com.google.common.io.Files.getNameWithoutExtension(filename);
        ImmutableMap.Builder<String, Object> mapBuilder = ImmutableMap.<String, Object>builder()
                .put("repo", project.getRepo())
                .put("project", project.getName())
                .put("filePath", filePathString)
                .put("baseFilename", baseFilename)
                .put("extension", extension)
                .put("lastIndexed", project.getLastIndexed());
        File fileAsFile = aFile.toFile();
        if (fileAsFile.isDirectory()) {
            // On OSX this happens. RM 02/17/2013
            LOG.error("Directory being called as file. JDK Bug? {}", aFile);
            mapBuilder.put("content", "Not Indexed - Directory");
            mapBuilder.put("crawlError", "Directory Processed as file");
        } else if (endsWithAny(aFile, FORBIDDEN_FILE_TYPES)) {
            mapBuilder.put("content", "Not Indexed - Forbidden filetype");
        } else if (fileAsFile.length() > MAX_FILE_SIZE_BYTES) {
            LOG.warn("Too big to index {}", aFile);
            mapBuilder.put("content", "Not Indexed - Too big to index");
            mapBuilder.put("crawlError", "Too big to index");
        } else {
            try {
                // Decode with an explicit charset so the indexed text does not depend on the
                // platform default of the machine running the agent.
                String content = new String(Files.readAllBytes(aFile), StandardCharsets.UTF_8);
                fileProcessor.addSourceAnalysisFields(mapBuilder, filePath, content);
                mapBuilder.put("content", content);
            } catch (IOException e) {
                LOG.error("Trouble reading file", e);
                mapBuilder.put("content", "Not Indexed - blew up reading file");
                // Plain concatenation: the original left a stray "{}" placeholder in the stored text.
                mapBuilder.put("crawlError", "Blew up reading file: " + e.getMessage());
            }
        }
        return esClient.prepareIndex(codeIndex, LocalRepoCrawler.DOC_TYPE, id).setSource(mapBuilder.build()).request();
    }

    /** Joins the name elements of {@code path} with forward slashes, regardless of platform. */
    private String renderPath(Path path) {
        // Should use forward slash on all platforms.
        return Joiner.on("/").join(path);
    }
}