package focusedCrawler.seedfinder;
import java.io.PrintStream;
import focusedCrawler.crawler.async.HttpDownloaderConfig;
import focusedCrawler.crawler.async.fetcher.FetcherFactory;
import focusedCrawler.crawler.crawlercommons.fetcher.http.SimpleHttpFetcher;
import focusedCrawler.seedfinder.QueryProcessor.QueryResult;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierFactory;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.CliTool;
import io.airlift.airline.Command;
import io.airlift.airline.Option;
/**
 * Command-line tool that repeatedly issues queries to a search engine and
 * writes the URLs of the pages reported as positive by the
 * {@link QueryProcessor} to a seeds file.
 *
 * <p>Starting from {@code --initialQuery}, each iteration processes the current
 * query, appends the positive page URLs to the seeds file (and, optionally,
 * every positive and negative result to a CSV file), and then asks the
 * {@link QueryGenerator} for the next query. The loop runs for at most
 * {@code --maxQueries} iterations.
 */
@Command(name="seedFinder", description="Runs the SeedFinder tool")
public class SeedFinder extends CliTool {

    /** User-agent string set on the HTTP fetcher handed to the search engine clients. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.95 Safari/537.11";

    /** Search engine back ends selectable through {@code --searchEngine}. */
    enum SearchEngineType {
        GOOGLE, BING, BING_API, ALL
    }

    @Option(name="--maxPages", description="Maximum number of pages per query")
    private int maxPagesPerQuery = 2;

    @Option(name="--minPrecision", description="Stops query pagination after precision drops bellow this minimum precision threshold")
    private double minPrecision = 0.5;

    @Option(name="--maxQueries", description="Max number of generated queries")
    private int maxNumberOfQueries = 100;

    @Option(name="--initialQuery", description="The inital query to issue to the search engine", required=true)
    private String initialQuery;

    @Option(name="--csvPath", description="The path where to write a CSV file with stats")
    private String csvPath;

    @Option(name="--modelPath", description="The path to the page classifier model", required=true)
    private String modelPath;

    @Option(name="--seedsPath", description="The path where the seeds generated should be saved")
    private String seedsPath = "";

    @Option(name="--searchEngine", description="The search engine to be used")
    private SearchEngineType searchEngine = SearchEngineType.ALL;

    /**
     * Program entry point; delegates argument handling and execution to {@link CliTool#run}.
     *
     * @param args raw command-line arguments
     */
    public static void main(String[] args) {
        CliTool.run(args, new SeedFinder());
    }

    /**
     * Runs the query loop and writes the seeds file (and the optional CSV file).
     *
     * <p>The seeds file is named {@code seeds_<initial query>.txt} and is created in
     * the working directory, or under {@code --seedsPath} when that option is set.
     *
     * @throws Exception if an output file cannot be opened or any of the
     *         collaborating components fails
     */
    @Override
    public void execute() throws Exception {

        SearchEngineApi api = createSearchEngineApi(this.searchEngine);
        System.out.println("Search Engine: "+api.getClass().getSimpleName());

        TargetClassifier classifier = TargetClassifierFactory.create(modelPath);

        Query query = new Query(initialQuery);
        QueryGenerator queryGenerator = new QueryGenerator(minPrecision);
        QueryProcessor queryProcessor = new QueryProcessor(maxPagesPerQuery, minPrecision, classifier, api);

        String seedFileName = (seedsPath.length() == 0) ? "seeds_" + query.asString() + ".txt" : seedsPath+"/seeds_" + query.asString() + ".txt";

        // Both streams are opened inside the try block so that a failure while
        // opening either one still releases the query processor and any stream
        // that was already opened.
        PrintStream seedsFile = null;
        PrintStream csvFile = null;
        try {
            seedsFile = new PrintStream(seedFileName);

            // The CSV log is optional: csvFile stays null when --csvPath is not given.
            if (csvPath != null && !csvPath.isEmpty()) {
                csvFile = new PrintStream(csvPath);
            }

            int numberOfQueries = 0;
            while (numberOfQueries < maxNumberOfQueries) {

                System.out.println("\n---------------");
                System.out.println("Executing QUERY: "+query.asString());
                System.out.println("---------------\n");

                QueryResult result = queryProcessor.processQuery(query);

                if (csvFile != null) {
                    writeResultsToLog(csvFile, query, result);
                }

                for (Page page : result.positivePages) {
                    seedsFile.println(page.getURL().toExternalForm());
                }

                System.out.println("\nBuilding next query...");
                query = queryGenerator.buildNextQuery(query, result);

                numberOfQueries++;
            }
        } finally {
            try {
                queryProcessor.close();
            } finally {
                // Nested finally: the streams are closed even if closing the
                // query processor throws. Each stream is null-checked because
                // it may never have been opened (csvFile is null by default).
                if (seedsFile != null) {
                    seedsFile.close();
                }
                if (csvFile != null) {
                    csvFile.close();
                }
            }
        }

        System.out.println("\nSeeds file created at: "+seedFileName);
    }

    /**
     * Writes one line per result page to {@code out} in the form
     * {@code <query>, <relevant|irrelevant>, <url>}; positive pages are
     * labelled "relevant" and negative pages "irrelevant".
     *
     * @param out    stream receiving the log lines
     * @param query  query that produced the results
     * @param result positive and negative pages found for the query
     */
    private void writeResultsToLog(PrintStream out, Query query, QueryResult result) {
        for (Page p : result.positivePages) {
            out.printf("%s, %s, %s\n", query.asString(), "relevant", p.getURL().toString());
        }
        for (Page p : result.negativePages) {
            out.printf("%s, %s, %s\n", query.asString(), "irrelevant", p.getURL().toString());
        }
    }

    /**
     * Builds the search engine client for the requested type. {@code ALL}
     * yields a {@link SearchEnginePool} combining Bing and Google.
     *
     * @param searchEngine the engine type selected on the command line
     * @return a client for the requested engine, never {@code null}
     * @throws IllegalArgumentException if the type has no matching client
     */
    private SearchEngineApi createSearchEngineApi(SearchEngineType searchEngine) {
        SimpleHttpFetcher fetcher = FetcherFactory.createSimpleHttpFetcher(new HttpDownloaderConfig());
        fetcher.setUserAgentString(USER_AGENT);
        switch (searchEngine) {
            case GOOGLE:
                return new GoogleSearch(fetcher);
            case BING:
                return new BingSearch(fetcher);
            case BING_API:
                return new BingSearchAzureAPI();
            case ALL:
                return new SearchEnginePool(new BingSearch(fetcher), new GoogleSearch(fetcher));
        }
        // Fail fast instead of returning null, which would only surface later
        // as a NullPointerException in execute().
        throw new IllegalArgumentException("Unsupported search engine: " + searchEngine);
    }

}