package focusedCrawler.tools; import java.io.IOException; import java.io.PrintStream; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.HashMap; import java.util.List; import com.fasterxml.jackson.databind.ObjectMapper; import focusedCrawler.memex.cdr.CDR2Document; import focusedCrawler.target.model.TargetModelJson; public class CrawlerEvalFile { static final ObjectMapper jsonMapper = new ObjectMapper(); public static void main(String[] args) throws IOException { Path path = Paths.get("/data/memex/crawleval/onion"); DirectoryStream<Path> basePathStream = Files.newDirectoryStream(path); for (Path outputPath : basePathStream) { Path dataPath = outputPath.resolve("data_target"); for(Path site : Files.newDirectoryStream(dataPath)) { System.out.println(outputPath.getFileName() + " - "+ site.getFileName()); indexFolder(site, site.getFileName()+"_NYU.json"); } } } private static void indexFolder(Path inputPath, String filename) throws IOException { PrintStream fileWriter = new PrintStream(filename); DirectoryStream<Path> fileStream = Files.newDirectoryStream(inputPath); for (Path filePath : fileStream) { final byte[] bytes = Files.readAllBytes(filePath); TargetModelJson pageModel = jsonMapper.readValue(bytes, TargetModelJson.class); if(pageModel == null) { continue; } List<String> contentTypeHeader = pageModel.getResponseHeaders().get("Content-Type"); if(contentTypeHeader == null) { contentTypeHeader = pageModel.getResponseHeaders().get("content-type"); } if(contentTypeHeader == null || contentTypeHeader.size() == 0) { continue; } String contentType = contentTypeHeader.iterator().next(); // if(!contentType.contains("text/html")) { // System.out.println("Discarding "+ pageModel.getUrl()+ " due to content-type: "+contentType); // continue; // } HashMap<String, Object> crawlData = new HashMap<>(); crawlData.put("response_headers", pageModel.getResponseHeaders()); String doc = new CDR2Document.Builder() .setUrl(pageModel.getUrl()) .setTimestamp(pageModel.getFetchTime()) .setContentType(contentType) .setTeam("NYU") .setCrawler("ACHE") .setRawContent(pageModel.getContentAsString()) .setCrawlData(crawlData) .buildAsJson(); fileWriter.println(doc); } fileWriter.close(); } }