package focusedCrawler.memex.cdr;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.codec.digest.DigestUtils;

import com.fasterxml.jackson.databind.ObjectMapper;

import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierFactory;
import focusedCrawler.target.classifier.TargetRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.CliTool;
import focusedCrawler.util.parser.PaginaURL;

import io.airlift.airline.Command;
import io.airlift.airline.Option;

/**
 * Merges multiple gzipped CDR 2.0 files into a single gzipped output file,
 * optionally de-duplicating documents and filtering out pages considered
 * irrelevant by a target-page classifier.
 */
@Command(name = "MergeCdrFiles", description = "Merges multiple CDR files into one")
public class MergeCdrFiles extends CliTool {

    @Option(name = "--input-path", description = "Path to folder with multiple CDR files", required = true)
    private String inputPath;

    @Option(name = "--output-file", description = "Gzipped output file containing data formatted as per CDR 2.0 schema", required = true)
    private String outputFile;

    @Option(name = "--modelPath", description = "Model path used to filter out irrelevant pages")
    private String modelPath;

    @Option(name = "--dedup", description = "Whether the merge should filter out duplicate documents")
    private boolean dedup;

    private static final ObjectMapper mapper = new ObjectMapper();

    private TargetClassifier classifier;
    private PrintWriter out;
    private final AtomicInteger processedPages = new AtomicInteger(0);
    private final AtomicInteger discardedPages = new AtomicInteger(0);
    // Hashes of documents already written; a concurrent set because all worker
    // threads check and insert into it
    private final Set<String> uniqueSet = ConcurrentHashMap.newKeySet();
    private BufferedReader in;
    private Iterator<File> files;

    public static void main(String[] args) throws Exception {
        CliTool.run(args, new MergeCdrFiles());
    }

    @Override
    public void execute() throws Exception {
        System.out.println("Reading CDR files from: " + inputPath);
        System.out.println("Generating CDR file at: " + outputFile);
        System.out.println("      Target model path: " + modelPath);

        File file = new File(inputPath);
        if (file.isDirectory()) {
            File[] fileList = file.listFiles();
            if (fileList == null || fileList.length == 0) {
                throw new IllegalArgumentException("No input files found at: " + inputPath);
            }
            files = Arrays.asList(fileList).iterator();
        } else {
            files = Arrays.asList(file).iterator();
        }

        // The model is optional; without one, every (non-duplicate) page is kept
        if (modelPath != null) {
            classifier = TargetClassifierFactory.create(modelPath);
        }

        in = openGzipFile(files.next());
        out = new PrintWriter(new GZIPOutputStream(new FileOutputStream(outputFile)), true);

        // Process lines concurrently using one worker thread per available core
        int threadNumber = Runtime.getRuntime().availableProcessors();
        Thread[] threads = new Thread[threadNumber];
        for (int i = 0; i < threadNumber; i++) {
            LineClassifier t = new LineClassifier();
            t.start();
            threads[i] = t;
        }
        for (int i = 0; i < threadNumber; i++) {
            threads[i].join();
        }

        in.close();
        out.close();

        System.out.printf("%d discarded out of %d pages processed.\n",
                discardedPages.intValue(), processedPages.intValue());
    }

    /**
     * Worker thread that reads one CDR document (one JSON object per line) at a
     * time, de-duplicates and classifies it, and writes relevant documents to
     * the output file.
     */
    class LineClassifier extends Thread {
        @Override
        public void run() {
            String line = readLine();
            while (line != null) {
                try {
                    CDR2Document doc = mapper.readValue(line, CDR2Document.class);

                    boolean discard = true;
                    // Set.add() returns false when the hash is already present,
                    // so the duplicate check-and-insert is a single atomic step
                    if (!dedup || uniqueSet.add(hashDocument(doc))) {
                        if (classifier == null || classify(doc).isRelevant()) {
                            discard = false;
                        }
                    }

                    if (discard) {
                        discardedPages.incrementAndGet();
                    } else {
                        // PrintWriter synchronizes internally, so concurrent
                        // println() calls write whole lines
                        out.println(line);
                    }

                    int count = processedPages.incrementAndGet();
                    if (count % 100 == 0) {
                        System.out.printf("%d discarded pages out of %d processed pages\n",
                                discardedPages.intValue(), count);
                    }
                } catch (Exception e) {
                    System.err.println("Failed to process page.");
                    e.printStackTrace();
                }
                line = readLine();
            }
        }
    }

    /**
     * Returns the next line of the current input file, transparently moving on
     * to the next file once the current one is exhausted. Synchronized because
     * the reader and the file iterator are shared by all worker threads.
     */
    private synchronized String readLine() {
        try {
            String line = in.readLine();
            while (line == null && files.hasNext()) {
                File file = files.next();
                System.out.println("Opening file: " + file.getCanonicalPath());
                in.close(); // close the exhausted reader before opening the next file
                in = openGzipFile(file);
                line = in.readLine();
            }
            return line;
        } catch (IOException e) {
            System.err.println("Failed to read next line.");
            e.printStackTrace();
            return null;
        }
    }

    private BufferedReader openGzipFile(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(
                new GZIPInputStream(new FileInputStream(file), 512 * 4096)));
    }

    private TargetRelevance classify(CDR2Document doc) throws Exception {
        Page page = new Page(new URL(doc.getUrl()), new String(doc.getRawContent()));
        PaginaURL pageParser = new PaginaURL(page);
        page.setParsedData(new ParsedData(pageParser));
        TargetRelevance relevance = classifier.classify(page);
        int count = processedPages.intValue();
        if (count % 100 == 0) {
            System.out.printf("%d %.3f %s\n", count, relevance.getRelevance(), doc.getUrl());
        }
        return relevance;
    }

    /**
     * Computes a fingerprint for a document from its normalized URL (scheme and
     * trailing slash removed) and its raw content.
     */
    private String hashDocument(CDR2Document doc) {
        String url = doc.getUrl();
        url = url.replaceFirst("https?://", "");
        if (url.endsWith("/")) {
            url = url.substring(0, url.length() - 1);
        }
        String contentHash = DigestUtils.sha1Hex(doc.getRawContent());
        String urlHash = DigestUtils.sha1Hex(url); // hash of the normalized URL
        return DigestUtils.md5Hex(urlHash + contentHash);
    }
}
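// Example invocation (a sketch: the jar name and directory paths below are
// hypothetical; the flags are the ones declared by this class, and the class
// is run directly through its own main() method):
//
//   java -cp ache.jar focusedCrawler.memex.cdr.MergeCdrFiles \
//       --input-path /data/cdr-files \
//       --output-file /data/merged-cdr.json.gz \
//       --modelPath /data/target-model \
//       --dedup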