package focusedCrawler.tools;
import java.io.PrintStream;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierFactory;
import focusedCrawler.target.classifier.TargetRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.target.model.TargetModelJson;
import focusedCrawler.target.repository.FileSystemTargetRepository;
import focusedCrawler.target.repository.FileSystemTargetRepository.DataFormat;
import focusedCrawler.target.repository.FileSystemTargetRepository.FileContentIterator;
import focusedCrawler.target.repository.FilesTargetRepository;
import focusedCrawler.target.repository.FilesTargetRepository.RepositoryIterator;
import focusedCrawler.util.CliTool;
import focusedCrawler.util.parser.PaginaURL;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
@Command(name = "PrintClassifierProbabilitiesTargetRepository")
public class PrintClassifierProbabilitiesTargetRepository extends CliTool {
@Option(name = "--input-path", required = true, description = "Path to old input data_target folder")
private String inputPath;
@Option(name = "--output-file", required = false, description = "Path to output file containing URL-probabilities")
private String outputFile;
@Option(name = "--model-path", required = true, description = "The path to the target classifier to be used")
private String modelPath;
@Option(name = "--hash-file-name", required = false, description = "If the repository uses hashed file names")
private boolean hashFilename = true;
@Option(name = "--compressed-data", required = false, description = "If the repository uses compressed files")
private boolean compressData = true;
@Option(name = "--files-repository", required = false, description = "If the new FilesTargetRepository")
private boolean filesRepository = false;
private int count = 0;
public static void main(String[] args) throws Exception {
CliTool.run(args, new PrintClassifierProbabilitiesTargetRepository());
}
@Override
public void execute() throws Exception {
System.out.println("Reading URLs from file: " + inputPath);
System.out.println("Writing output file at: " + outputFile);
PrintStream out = System.out;
if(outputFile != null) {
out = new PrintStream(outputFile);
}
TargetClassifier classifier = TargetClassifierFactory.create(modelPath);
if(filesRepository) {
FilesTargetRepository repository = new FilesTargetRepository(inputPath);
try(RepositoryIterator it = repository.iterator()) {
while(it.hasNext()) {
TargetModelJson target = it.next();
printClassifierOutput(classifier, target, out);
}
}
} else {
FileSystemTargetRepository repository = new FileSystemTargetRepository(inputPath, DataFormat.JSON, hashFilename, compressData);
try(FileContentIterator<TargetModelJson> it = repository.iterator()) {
while(it.hasNext()) {
TargetModelJson target = it.next();
printClassifierOutput(classifier, target, out);
}
}
}
}
private void printClassifierOutput(TargetClassifier classifier, TargetModelJson target, PrintStream out) {
try {
Page page = new Page(target);
PaginaURL pageParser = new PaginaURL(page);
page.setParsedData(new ParsedData(pageParser));
TargetRelevance relevance = classifier.classify(page);
out.printf("%.6f %s\n", relevance.getRelevance(), target.getUrl());
count++;
if(count % 1000 == 0) {
System.out.printf("Processed %d files...\n", count);
}
} catch (Exception e) {
System.err.printf("Failed to process URL: %s", target.getUrl());
e.printStackTrace(System.err);
}
}
}