package focusedCrawler.tools;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import focusedCrawler.link.frontier.LinkRelevance;
import focusedCrawler.util.CliTool;

import io.airlift.airline.Command;
import io.airlift.airline.Option;
@Command(name="CountTlds", description="Counts the number of unique TLDs in a text file containing one URL per line")
public class CountTlds extends CliTool {

    @Option(name = "--input-file",
            required = true,
            description = "Path to file containing one URL per line")
    private String inputPath;

    @Option(name = "--output-file",
            required = true,
            description = "Text file with TLD counts")
    private String outputFile;

    public static void main(String[] args) throws Exception {
        CliTool.run(args, new CountTlds());
    }

    /**
     * Reads URLs line-by-line from {@code inputPath}, tallies how many URLs
     * fall under each top-level domain, and writes one "tld count" pair per
     * line to {@code outputFile}.
     *
     * @throws Exception if the input file cannot be read, the output file
     *                   cannot be written, or a line cannot be parsed as a URL
     *                   by {@link LinkRelevance}
     */
    @Override
    public void execute() throws Exception {
        System.out.println("Reading URLs from file: " + inputPath);
        System.out.println("Writing output file at: " + outputFile);

        int processedPages = 0;
        Map<String, Integer> tldCounts = new HashMap<>();

        // Files.newBufferedReader reads UTF-8 by default, unlike FileReader,
        // which silently uses the platform charset.
        try (BufferedReader br = Files.newBufferedReader(Paths.get(inputPath))) {
            String line;
            while ((line = br.readLine()) != null) {
                String tld = new LinkRelevance(line, 0).getTopLevelDomainName();
                // merge() replaces the get/null-check/increment/put sequence
                // and avoids the deprecated new Integer(...) constructor.
                tldCounts.merge(tld, 1, Integer::sum);
                processedPages++;
                if (processedPages % 1000 == 0) {
                    System.out.printf("Counted %d pages...%n", processedPages);
                }
            }
        }

        // Write counts as UTF-8 explicitly; autoflush keeps partial results
        // on disk even if a later line fails.
        try (PrintWriter out = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(outputFile), StandardCharsets.UTF_8), true)) {
            for (Entry<String, Integer> count : tldCounts.entrySet()) {
                out.printf("%s %d%n", count.getKey(), count.getValue());
            }
        }
        System.out.printf("Finished processing %d pages.%n", processedPages);
    }
}