package focusedCrawler.tools;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.google.common.hash.HashCode;
import com.google.common.hash.Hashing;
import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.FilesTargetRepository;
import focusedCrawler.target.repository.FilesTargetRepository.RepositoryIterator;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
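/**
 * Scans a FILES repository, fingerprints each page's content with SHA-1, and
 * reports exact-duplicate statistics per host. When --deduped-path is given,
 * it also writes a new FILES repository containing only the first copy of
 * each distinct page.
 *
 * Each line of the statistics file contains: host, total pages, duplicate
 * pages, and the duplicate fraction (duplicates / total).
 *
 * Illustrative invocation (the jar name and paths below are examples, not
 * fixed by this codebase):
 *
 *   java -cp ache.jar focusedCrawler.tools.DuplicateDetector \
 *       --input-path /data/crawl/files \
 *       --output-file /data/crawl/dup-stats.txt \
 *       --deduped-path /data/crawl/files-deduped
 */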
@Command(name="DuplicateDetector", description="Counts duplicates and create deduplicated repository")
public class DuplicateDetector extends CliTool {
@Option(name = "--input-path",
required = true,
description = "Path to directory containing a FILES repository")
private String inputPath;
@Option(name = "--output-file",
required = true,
description = "Text file containing duplicate statistics per TLD")
private String outputFile;
@Option(name = {"--deduped-path"},
description = "A new FILES repository contaning only unique pages")
private String dedupedRepositoryPath;
public static void main(String[] args) throws Exception {
CliTool.run(args, new DuplicateDetector());
}
@Override
public void execute() throws Exception {
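        // Single pass over the input repository: hash every page body, count
        // duplicates globally and per host, and (optionally) copy the first
        // occurrence of each page into the deduplicated repository.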
System.out.println("Reading URLs from file: " + inputPath);
System.out.println("Writing statistics output file at: " + outputFile);
FilesTargetRepository repository = new FilesTargetRepository(inputPath);
FilesTargetRepository dedupRepository = null;
if(dedupedRepositoryPath != null && !dedupedRepositoryPath.isEmpty()) {
dedupRepository = new FilesTargetRepository(dedupedRepositoryPath);
System.out.println("Writing deduped repository at: " + outputFile);
}
int totalPages = 0;
int dupPages = 0;
int uniqPages = 0;
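        // SHA-1 fingerprints of every distinct page body seen so far, plus
        // per-host counters backing the statistics file.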
Set<String> seen = new HashSet<>();
Map<String, Integer> totalCounts = new HashMap<>();
Map<String, Integer> dupCounts = new HashMap<>();
RepositoryIterator it = repository.iterator();
while(it.hasNext()) {
TargetModelJson page = it.next();
String host = new URL(page.getUrl()).getHost();
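            // Fingerprint the raw response body: pages count as duplicates
            // only when their bytes are identical (exact-duplicate detection;
            // near-duplicates are not caught).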
HashCode code = Hashing.sha1().hashBytes(page.getContent());
String fingerprint = code.toString();
if(seen.contains(fingerprint)) {
                dupCounts.merge(host, 1, Integer::sum);
dupPages++;
} else {
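                // First occurrence of this content: keep it in the deduplicated
                // repository (when one was requested) and remember its fingerprint.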
if(dedupRepository != null) {
dedupRepository.insert(page);
}
seen.add(fingerprint);
uniqPages++;
}
            totalCounts.merge(host, 1, Integer::sum);
totalPages++;
if (totalPages % 1000 == 0) {
System.out.printf("Processed %s pages...\n", totalPages);
}
}
repository.close();
if(dedupRepository != null) {
dedupRepository.close();
}
System.out.printf("Finished processing %d pages (%d unique, %d duplicates, %.2f%%).\n", totalPages, uniqPages, dupPages, dupPages/(double)totalPages);
System.out.println("Printing statistics file...");
try (PrintWriter out = new PrintWriter(new FileOutputStream(outputFile), true)) {
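            // One line per host: host, total pages, duplicate pages, and the
            // duplicate fraction (duplicates / total).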
for (Entry<String, Integer> hostCount : totalCounts.entrySet()) {
                int dups = dupCounts.getOrDefault(hostCount.getKey(), 0);
int total = hostCount.getValue();
out.printf("%s %d %d %.2f\n", hostCount.getKey(), total, dups, dups/(double)total);
}
}
System.out.println("done.");
}
}