package focusedCrawler.memex.cdr;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;

import com.fasterxml.jackson.databind.ObjectMapper;

import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
import io.airlift.airline.Option;

/**
 * Command-line tool that reads gzip-compressed CDR files (one JSON document per line),
 * extracts a top-level domain name from each document URL via
 * {@link LinkRelevance#getTopLevelDomainName()}, and writes how many documents were
 * seen for each distinct value to a text file.
 */
@Command(name="CountCdrTlds", description="Counts the number of unique TLDs in a CDR file")
public class CountTlds extends CliTool {

    /** Shared JSON mapper used to parse every input line. */
    private static final ObjectMapper mapper = new ObjectMapper();

    /** Size in bytes of the gzip input buffer (2 MiB). */
    private static final int GZIP_BUFFER_SIZE = 512 * 4096;

    /** Number of processed pages between two progress messages. */
    private static final int PROGRESS_INTERVAL = 1000;

    @Option(name="--input-path", required=true, description="Path to folder with multiple CDR files")
    private String inputPath;

    @Option(name="--output-file", required=true, description="Text file with TLD counts")
    private String outputFile;

    public static void main(String[] args) throws Exception {
        CliTool.run(args, new CountTlds());
    }

    /**
     * Counts the documents per top-level domain name across all input files and writes
     * the totals to {@code outputFile}.
     *
     * @throws Exception if an input file cannot be read or parsed, or if the output
     *                   file cannot be written
     */
    @Override
    public void execute() throws Exception {
        System.out.println("Reading CDR files from: " + inputPath);
        System.out.println("Writing TLD counts to: " + outputFile);

        Map<String, Integer> tldCounts = new HashMap<String, Integer>();
        int processedPages = 0;
        for (File file : listInputFiles(new File(inputPath))) {
            System.out.printf("Processing file: %s\n", file.getCanonicalPath());
            processedPages = countTlds(file, tldCounts, processedPages);
        }

        writeCounts(tldCounts);
        System.out.printf("Finished processing %d pages.\n", processedPages);
    }

    /**
     * Resolves the input path to the list of files to process: the direct children of a
     * directory, or the path itself when it is not a directory.
     *
     * @param input the path given on the command line
     * @return the files to read
     * @throws IOException if {@code input} is a directory whose contents cannot be listed
     */
    private static List<File> listInputFiles(File input) throws IOException {
        if (!input.isDirectory()) {
            return Arrays.asList(input);
        }
        // listFiles() returns null (rather than throwing) when the directory cannot be
        // read; fail with a clear message instead of a NullPointerException.
        File[] children = input.listFiles();
        if (children == null) {
            throw new IOException("Unable to list files in directory: " + input);
        }
        return Arrays.asList(children);
    }

    /**
     * Reads one gzip-compressed CDR file and adds one to the count of each document's
     * top-level domain name.
     *
     * @param file           the CDR file to read
     * @param tldCounts      running totals, updated in place
     * @param processedPages number of pages processed before this file
     * @return the number of pages processed including this file
     * @throws Exception if the file cannot be read, a line is not a valid document, or
     *                   the document URL is rejected by {@link LinkRelevance}
     */
    private int countTlds(File file, Map<String, Integer> tldCounts, int processedPages)
            throws Exception {
        try (BufferedReader in = openGzipFile(file)) {
            String line;
            while ((line = in.readLine()) != null) {
                CDR2Document doc = mapper.readValue(line, CDR2Document.class);
                LinkRelevance link = new LinkRelevance(doc.getUrl(), 0);
                String tld = link.getTopLevelDomainName();

                Integer previous = tldCounts.get(tld);
                tldCounts.put(tld, previous == null ? 1 : previous + 1);

                processedPages++;
                if (processedPages % PROGRESS_INTERVAL == 0) {
                    System.out.printf("Counted %s pages...\n", processedPages);
                }
            }
        }
        return processedPages;
    }

    /**
     * Writes one {@code "<tld>, <count>"} line per entry to {@code outputFile}, encoded
     * as UTF-8.
     *
     * @param tldCounts the totals to write
     * @throws IOException if the output file cannot be opened or written
     */
    private void writeCounts(Map<String, Integer> tldCounts) throws IOException {
        try (PrintWriter out = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8))) {
            for (Entry<String, Integer> count : tldCounts.entrySet()) {
                out.printf("%s, %d\n", count.getKey(), count.getValue());
            }
            // PrintWriter never throws on write failures; checkError() flushes and
            // reports whether any write failed so the failure is not silently lost.
            if (out.checkError()) {
                throw new IOException("Failed to write TLD counts to: " + outputFile);
            }
        }
    }

    /**
     * Opens a gzip-compressed file as a buffered UTF-8 text reader.
     *
     * @param file the gzip file to open
     * @return a reader over the decompressed content; the caller must close it
     * @throws FileNotFoundException if the file cannot be opened for reading
     * @throws IOException           if the gzip header cannot be read
     */
    private BufferedReader openGzipFile(File file) throws IOException, FileNotFoundException {
        FileInputStream raw = new FileInputStream(file);
        try {
            return new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(raw, GZIP_BUFFER_SIZE), StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            // The GZIPInputStream constructor reads the header and may fail; close the
            // underlying stream so the file handle is not leaked.
            try {
                raw.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}