package focusedCrawler.tools;
import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.file.Files;
import java.nio.file.Paths;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierFactory;
import focusedCrawler.target.classifier.TargetRelevance;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.CliTool;
import focusedCrawler.util.parser.PaginaURL;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
/**
 * Command-line tool that reads a single HTML page from a file, runs it
 * through a target classifier created from the given model path, and prints
 * whether the page was classified as relevant together with the relevance
 * score reported by the classifier.
 *
 * <p>The URL associated with the page is derived from the input file name
 * (URL-decoded as UTF-8). When the file name cannot be interpreted as a URL,
 * a placeholder URL is used instead.
 */
@Command(name="TargetClassifierTester", description="Classifies a pages using a given classifier")
public class TargetClassifierTester extends CliTool {

    /** Placeholder URL used when the input file name is not a valid URL. */
    private static final String FALLBACK_URL = "http://";

    @Option(name = "--input-file",
            required = true,
            description = "Path to file a file containing an HTML page (name of file should be a valid URL)")
    private String inputPath;

    @Option(name = "--model",
            required = true,
            description = "A path to the target classifier model")
    private String model;

    /**
     * Program entry point; delegates argument handling and execution to
     * {@link CliTool#run}.
     *
     * @param args command-line arguments
     * @throws Exception if running the tool fails
     */
    public static void main(String[] args) throws Exception {
        CliTool.run(args, new TargetClassifierTester());
    }

    /**
     * Reads the page content from {@code inputPath}, builds a {@link Page}
     * for it, classifies the page with the classifier created from
     * {@code model}, and prints the result to standard output.
     *
     * @throws Exception if the input file cannot be read or classification fails
     */
    @Override
    public void execute() throws Exception {

        System.out.println("Reading page content from file: " + inputPath);
        System.out.println(" Loading classifier from: " + model);

        String content = new String(Files.readAllBytes(Paths.get(inputPath)), "UTF-8");

        String url = FALLBACK_URL;
        try {
            String filename = new File(inputPath).getName();
            url = new URL(URLDecoder.decode(filename, "UTF-8")).toString();
        } catch (MalformedURLException | IllegalArgumentException e) {
            // URLDecoder.decode throws IllegalArgumentException on malformed
            // percent-escapes (e.g. a file named "100%.html"); treat that the
            // same as a name that does not parse as a URL and keep the
            // placeholder instead of aborting the whole run.
            System.out.println("File name not recognized as valid URL.");
        }

        Page page = createPage(url, content);

        TargetClassifier classifier = TargetClassifierFactory.create(model);
        TargetRelevance result = classifier.classify(page);

        String label = result.isRelevant() ? "Relevant" : "Irrelevant";
        System.out.println("-------------------------");
        System.out.println(" Classified as: "+ label);
        System.out.println("Classification confidence: "+ String.format("%.4f",result.getRelevance()));
        System.out.println("-------------------------");
    }

    /**
     * Builds a {@link Page} for the given URL and raw content and attaches
     * parsed data produced by {@link PaginaURL}.
     *
     * @param urlStr the URL to associate with the page
     * @param cont   the raw page content
     * @return the page with its parsed data set
     * @throws MalformedURLException if {@code urlStr} is not a valid URL
     */
    private Page createPage(String urlStr, String cont) throws MalformedURLException {
        URL url = new URL(urlStr);
        Page page1 = new Page(url, cont);
        page1.setParsedData(new ParsedData(new PaginaURL(page1)));
        return page1;
    }
}