package com.cadrlife.devsearch.agent.indexing;

import com.cadrlife.devsearch.agent.service.analysis.FileProcessor;
import com.cadrlife.devsearch.domain.IdUtil;
import com.cadrlife.devsearch.domain.Project;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.io.DirectoryWalker;
import org.apache.commons.lang.StringUtils;
import org.elasticsearch.action.bulk.BulkProcessor;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Client;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;

/**
 * Walks a project's checkout directory and queues one index request per visited file
 * on the supplied {@link BulkProcessor}.
 *
 * <p>Directories whose name ends with an entry of {@code DIRECTORIES_TO_SKIP} are not
 * descended into. Files whose name contains {@code ".#"} are ignored. Files that are of a
 * forbidden type or larger than {@code MAX_FILE_SIZE_BYTES} are still indexed, but with a
 * placeholder instead of their content.
 */
public class ProjectWalker extends DirectoryWalker<Void> {

    /** File-name prefixes that cause a file to be ignored entirely. */
    private static final String[] FILE_PREFIXES_TO_SKIP = new String[]{".#"};

    /** Directory-name suffixes that stop the walk from descending into a directory. */
    private static final String[] DIRECTORIES_TO_SKIP =
            new String[]{".git", ".bin", ".settings", ".idea", "CVS", ".gradle"};

    /** File-name suffixes whose content is never read; a placeholder is indexed instead. */
    private static final String[] FORBIDDEN_FILE_TYPES = new String[]{
            ".jpi", ".gif", ".png", ".jar", ".war", ".ear", ".class", ".jpg", ".bmp", ".exe",
            ".dmg", ".zip", ".gz", ".tar", ".bz2", ".ico", ".psd", ".swf", ".vsd", ".ppt",
            ".min.js", ".min.css", ".dll"};

    private static final Logger LOG = LoggerFactory.getLogger(ProjectWalker.class);

    /** Files larger than this (300 KiB) are indexed without their content. */
    private static final long MAX_FILE_SIZE_BYTES = 300 * 1024;

    private final Path rootPath;
    private final Project project;
    private final String codeIndex;
    private final Client esClient;
    private final BulkProcessor bulkProcessor;
    private final FileProcessor fileProcessor;

    /**
     * @param rootPath      path that visited files are relativized against
     * @param project       project being indexed; supplies repo, name, checkout path and
     *                      last-indexed value
     * @param codeIndex     name of the index the requests are prepared for
     * @param esClient      client used to prepare the index requests
     * @param bulkProcessor receives every prepared index request
     * @param fileProcessor adds source-analysis fields for files whose content is read
     */
    public ProjectWalker(Path rootPath, Project project, String codeIndex, Client esClient,
                         BulkProcessor bulkProcessor, FileProcessor fileProcessor) {
        this.rootPath = rootPath;
        this.project = project;
        this.codeIndex = codeIndex;
        this.esClient = esClient;
        this.bulkProcessor = bulkProcessor;
        this.fileProcessor = fileProcessor;
    }

    /**
     * Walks the project's checkout path and queues an index request for each visited file.
     *
     * @throws IOException          if the directory walk fails
     * @throws NullPointerException if the project's last-indexed value has not been set
     */
    public void walkProject() throws IOException {
        // checkNotNull takes (reference, message). With the arguments swapped the check can
        // never fail, and a null value only surfaces later as a message-less NPE when it is
        // put into the ImmutableMap builder.
        Preconditions.checkNotNull(project.getLastIndexed(),
                "Last indexed date must be updated before walk");
        walk(project.getCheckoutPath().toFile(), new ArrayList<Void>());
    }

    /**
     * Decides whether the walk descends into {@code directory}.
     *
     * @return {@code false} when the directory name ends with one of
     *         {@code DIRECTORIES_TO_SKIP}, {@code true} otherwise
     */
    @Override
    protected boolean handleDirectory(File directory, int depth, Collection<Void> results)
            throws IOException {
        // NOTE(review): this is a suffix match, so e.g. "foo.git" is skipped as well as ".git".
        return !endsWithAny(directory.toPath(), DIRECTORIES_TO_SKIP);
    }

    /**
     * Queues {@code file} for indexing unless its name marks it as one to ignore.
     */
    @Override
    protected void handleFile(File file, int depth, Collection<Void> results) throws IOException {
        String fileName = file.getName();
        if (StringUtils.startsWithAny(fileName, FILE_PREFIXES_TO_SKIP)) {
            return;
        }
        // Also ignore names that carry ".#" anywhere, not only as a prefix.
        if (fileName.contains(".#")) {
            return;
        }
        indexFile(file.toPath());
    }

    /** Builds the index request for {@code aFile} and hands it to the bulk processor. */
    private void indexFile(Path aFile) {
        LOG.debug("Index file {} to {} relativized as {}", aFile, codeIndex, rootPath.relativize(aFile));
        bulkProcessor.add(buildFileIndexRequest(aFile));
    }

    /**
     * Tests whether the last name element of {@code path} ends with any of the given suffixes.
     *
     * @return {@code true} on the first matching suffix; {@code false} if none match or the
     *         path has no name elements (for example a filesystem root)
     */
    private boolean endsWithAny(Path path, String... extensions) {
        Path lastSegment = path.getFileName();
        if (lastSegment == null) {
            // A root path has no name elements; nothing to match against.
            return false;
        }
        String lastName = lastSegment.toString();
        for (String ext : extensions) {
            if (lastName.endsWith(ext)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the index request describing {@code aFile}.
     *
     * <p>The document always carries {@code repo}, {@code project}, {@code filePath},
     * {@code baseFilename}, {@code extension}, {@code lastIndexed} and {@code content}.
     * When the content is not read, {@code content} holds a "Not Indexed - ..." placeholder,
     * and for every such case except forbidden file types a {@code crawlError} field
     * records the reason.
     */
    private IndexRequest buildFileIndexRequest(Path aFile) {
        Path relative = rootPath.relativize(aFile);
        // Drops the first two name elements of the root-relative path.
        // NOTE(review): presumably these are the repo and project directories; subpath
        // throws IllegalArgumentException when fewer than three elements remain - confirm
        // against the checkout layout.
        Path filePath = relative.subpath(2, relative.getNameCount());
        String filePathString = renderPath(filePath);
        String id = IdUtil.docId(project.getRepo(), project.getName(), filePathString);
        String filename = filePath.getFileName().toString();
        String extension = com.google.common.io.Files.getFileExtension(filename);
        String baseFilename = com.google.common.io.Files.getNameWithoutExtension(filename);

        ImmutableMap.Builder<String, Object> mapBuilder = ImmutableMap.<String, Object>builder()
                .put("repo", project.getRepo())
                .put("project", project.getName())
                .put("filePath", filePathString)
                .put("baseFilename", baseFilename)
                .put("extension", extension)
                .put("lastIndexed", project.getLastIndexed());

        File fileAsFile = aFile.toFile();
        if (fileAsFile.isDirectory()) {
            // On OSX this happens. RM 02/17/2013
            LOG.error("Directory being called as file. JDK Bug? {}", aFile);
            mapBuilder.put("content", "Not Indexed - Directory");
            mapBuilder.put("crawlError", "Directory Processed as file");
        } else if (endsWithAny(aFile, FORBIDDEN_FILE_TYPES)) {
            mapBuilder.put("content", "Not Indexed - Forbidden filetype");
        } else if (fileAsFile.length() > MAX_FILE_SIZE_BYTES) {
            LOG.warn("Too big to index {}", aFile);
            mapBuilder.put("content", "Not Indexed - Too big to index");
            mapBuilder.put("crawlError", "Too big to index");
        } else {
            try {
                // Decode explicitly as UTF-8 so the indexed text does not depend on the
                // platform default charset of whichever host runs the agent.
                String content = new String(Files.readAllBytes(aFile), StandardCharsets.UTF_8);
                fileProcessor.addSourceAnalysisFields(mapBuilder, filePath, content);
                mapBuilder.put("content", content);
            } catch (IOException e) {
                LOG.error("Trouble reading file", e);
                mapBuilder.put("content", "Not Indexed - blew up reading file");
                // Plain concatenation: a "{}" logging placeholder has no meaning here and
                // would be stored verbatim in the document.
                mapBuilder.put("crawlError", "Blew up reading file: " + e.getMessage());
            }
        }

        return esClient.prepareIndex(codeIndex, LocalRepoCrawler.DOC_TYPE, id)
                .setSource(mapBuilder.build())
                .request();
    }

    /** Renders the name elements of {@code path} joined by forward slashes. */
    private String renderPath(Path path) {
        // Should use forward slash on all platforms.
        return Joiner.on("/").join(path);
    }
}