package focusedCrawler;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.Method;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import focusedCrawler.config.ConfigService;
import focusedCrawler.crawler.async.AsyncCrawler;
import focusedCrawler.crawler.async.AsyncCrawlerConfig;
import focusedCrawler.link.LinkStorage;
import focusedCrawler.link.frontier.FrontierManager;
import focusedCrawler.link.frontier.FrontierManagerFactory;
import focusedCrawler.link.frontier.FrontierPersistentException;
import focusedCrawler.rest.RestServer;
import focusedCrawler.seedfinder.SeedFinder;
import focusedCrawler.target.TargetStorage;
import focusedCrawler.target.classifier.WekaTargetClassifierBuilder;
import focusedCrawler.util.MetricsManager;
import focusedCrawler.util.storage.Storage;
import io.airlift.airline.Arguments;
import io.airlift.airline.Cli;
import io.airlift.airline.Cli.CliBuilder;
import io.airlift.airline.Command;
import io.airlift.airline.Help;
import io.airlift.airline.Option;
import io.airlift.airline.ParseException;
/**
* <p>
* Description: This is the main entry point for working with the components of ACHE
* </p>
*/
public class Main {
public static final String VERSION = Main.class.getPackage().getImplementationVersion();
public static final Logger logger = LoggerFactory.getLogger(Main.class);
/**
 * Command-line entry point. Prints the version banner, builds the airline
 * parser with every ACHE command, then parses {@code args} and runs the
 * selected command. {@link AcheHelp} is the default when no command is given.
 *
 * <p>On a parse error the message and general help are printed and the JVM
 * exits with status 1. Any other exception thrown by the command is reported
 * on standard error together with its stack trace.
 *
 * @param args raw command-line arguments
 */
public static void main(String... args) {
    printVersion();

    @SuppressWarnings("unchecked")
    CliBuilder<Runnable> builder = Cli.<Runnable>builder("ache")
        .withDescription("ACHE Focused Crawler")
        .withDefaultCommand(AcheHelp.class)
        .withCommands(
            AcheHelp.class,
            StartCrawl.class,
            BuildModel.class,
            AddSeeds.class,
            StartLinkStorage.class,
            // Must be registered here, otherwise "ache startTargetStorage"
            // (listed in the help examples) cannot be parsed.
            StartTargetStorage.class,
            StartCrawlManager.class,
            SeedFinder.class,
            RunCliTool.class
        );

    Cli<Runnable> acheParser = builder.build();
    try {
        acheParser.parse(args).run();
    }
    catch (ParseException e) {
        System.out.println("Unable to parse the input. "+e.getMessage()+"\n");
        Help.help(acheParser.getMetadata(), Arrays.asList());
        System.exit(1);
    }
    catch (Exception e) {
        System.err.println("Failed to execute command.");
        e.printStackTrace(System.err);
    }
}
/**
 * Prints the startup banner: the header line framed above and below by a
 * row of dashes of the same length, followed by a blank line.
 *
 * <p>{@code VERSION} is read from the package's implementation version and
 * is {@code null} when that information is unavailable (for example when
 * the classes are not loaded from a jar with a manifest). In that case the
 * version suffix is left out instead of printing the literal text "null".
 */
private static void printVersion() {
    String header = (VERSION == null) ? "ACHE Crawler" : "ACHE Crawler " + VERSION;

    // Build the separator once instead of printing it dash by dash, twice.
    char[] dashes = new char[header.length()];
    Arrays.fill(dashes, '-');
    String separator = new String(dashes);

    System.out.println(separator);
    System.out.println(header);
    System.out.println(separator);
    System.out.println();
}
/**
 * Help command that, after the standard help output, also prints usage
 * examples when the inherited {@code command} list is empty (presumably
 * meaning that no specific command was asked about).
 */
public static class AcheHelp extends Help {

    @Override
    public void run() {
        super.run();
        if (command.isEmpty()) {
            printExamples();
        }
    }

    /**
     * Prints one example invocation per command. Each example supplies every
     * option that the corresponding command declares as required, so the
     * lines can be used as written.
     */
    private static void printExamples() {
        System.out.println("EXAMPLES\n");
        System.out.println(" ache startCrawl -c config/sample_config -o data -s config/sample.seeds -m config/sample_model");
        System.out.println(" ache buildModel -c config/sample_config/stoplist.txt -t training_data -o output_model");
        System.out.println(" ache addSeeds -c config/sample_config -o data -s config/sample.seeds");
        // startLinkStorage declares -m as required
        System.out.println(" ache startLinkStorage -c config/sample_config -o data -s config/sample.seeds -m config/sample_model");
        System.out.println(" ache startTargetStorage -c config/sample_config -o data -m config/sample_model");
        // startCrawlManager declares -o as required
        System.out.println(" ache startCrawlManager -c config/sample_config -o data");
    }
}
/**
 * Command that runs the static {@code main(String[])} method of a tool class.
 * The first positional argument names the class; it is looked up first inside
 * the {@code focusedCrawler.tools} package and then as a fully qualified name.
 * All remaining arguments are handed to the tool unchanged.
 */
@Command(name = "run", description = "Run any available utilitary tool")
public static class RunCliTool implements Runnable {

    /** Tool class name followed by the tool's own parameters. */
    @Arguments(description = "Tool to be executed followed by its parameters")
    public List<String> args;

    /**
     * Resolves the tool class and invokes its {@code main} method. Exits the
     * JVM with status 1 when no tool name was given, when the class cannot
     * be found, or when invoking the tool fails.
     */
    @Override
    public void run() {
        if (args == null || args.isEmpty()) {
            System.out.println("ERROR: Class name of command-line tool not specified.");
            System.exit(1);
        }

        final String toolName = args.get(0);
        final Class<?> toolType = findToolClass(toolName);
        if (toolType == null) {
            System.out.println("Unable to find CLI tool named "+toolName);
            System.exit(1);
        }

        // Everything after the tool name is passed through to the tool's main()
        final String[] toolArgs = new String[args.size() - 1];
        for (int i = 0; i < toolArgs.length; i++) {
            toolArgs[i] = args.get(i + 1);
        }

        try {
            Method entryPoint = toolType.getMethod("main", String[].class);
            // Cast keeps the array as a single argument instead of varargs expansion
            entryPoint.invoke(null, (Object) toolArgs);
        } catch (Exception failure) {
            System.out.printf("Failed to run tool %s.\n\n", toolType.getName());
            failure.printStackTrace(System.out);
            System.exit(1);
        }
    }

    /**
     * Looks the tool up via the current thread's context class loader.
     *
     * @param toolName simple name of a class in {@code focusedCrawler.tools},
     *                 or a fully qualified class name
     * @return the loaded class, or {@code null} if neither lookup succeeds
     */
    private static Class<?> findToolClass(String toolName) {
        ClassLoader loader = Thread.currentThread().getContextClassLoader();
        try {
            return loader.loadClass("focusedCrawler.tools."+toolName);
        } catch (ClassNotFoundException notInToolsPackage) {
            // fall through and try the name as given
        }
        try {
            return loader.loadClass(toolName);
        } catch (ClassNotFoundException notFound) {
            return null;
        }
    }
}
/**
 * Command that builds a Weka target classifier model in three steps, each
 * delegated to {@link WekaTargetClassifierBuilder}: create the Weka input
 * file ({@code weka.arff} inside the training folder), train the model, and
 * create the features file.
 */
@Command(name = "buildModel", description = "Builds a model for a Weka target classifier")
public static class BuildModel implements Runnable {

    @Option(name = {"-t", "--trainingDataDir"}, required = true, description = "Path to folder containing training data")
    String trainingPath;

    @Option(name = {"-o", "--outputDir"}, required = true, description = "Path to folder which model built should be stored")
    String outputPath;

    @Option(name = {"-c", "--stopWordsFile"}, required = false, description = "Path to stopwords file")
    String stopWordsFile;

    @Option(name = {"-l", "--learner"}, required = false, description = "Machine-learning algorithm to be used to train the model (SMO, RandomForest)")
    String learner;

    /**
     * Creates the output folder if necessary and runs the three build steps.
     *
     * @throws IllegalStateException if the output folder does not exist and
     *         cannot be created
     */
    @Override
    public void run() {
        File outputDir = new File(outputPath);
        // mkdirs() also returns false when the folder already exists, so it is
        // only an error if the folder is still missing afterwards.
        if (!outputDir.mkdirs() && !outputDir.isDirectory()) {
            throw new IllegalStateException("Unable to create output directory: " + outputPath);
        }

        // generate the input for weka
        System.out.println("Preparing training data...");
        WekaTargetClassifierBuilder.createInputFile(stopWordsFile, trainingPath, trainingPath + "/weka.arff" );

        // generate the model
        System.out.println("Training model...");
        WekaTargetClassifierBuilder.trainModel(trainingPath, outputPath, learner);

        // generate features file
        System.out.println("Creating feature file...");
        WekaTargetClassifierBuilder.createFeaturesFile(outputPath,trainingPath);

        System.out.println("done.");
    }
}
/**
 * Command that creates a {@link FrontierManager} from the link storage
 * configuration, the data folder and the seed file, and then closes it
 * right away. Presumably the creation step is what loads the seeds into
 * the frontier; confirm against {@code FrontierManagerFactory.create}.
 */
@Command(name = "addSeeds", description = "Add seeds used to bootstrap the crawler")
public static class AddSeeds implements Runnable {

    @Option(name = {"-o", "--outputDir"}, required = true, description = "Path to a folder to store crawler data")
    String dataOutputPath;

    @Option(name = {"-c", "--configDir"}, required = true, description = "Path to configuration files folder")
    String configPath;

    @Option(name = {"-s", "--seed"}, required = true, description = "Path to file of seed URLs")
    String seedPath;

    /** Reads {@code ache.yml} from the configuration folder, then creates and closes the frontier manager. */
    @Override
    public void run() {
        final String configFile = Paths.get(configPath, "ache.yml").toString();
        final ConfigService config = new ConfigService(configFile);

        final FrontierManager frontierManager = FrontierManagerFactory.create(
                config.getLinkStorageConfig(),
                configPath,
                dataOutputPath,
                seedPath,
                null);
        frontierManager.close();
    }
}
/**
 * Command that runs a stand-alone LinkStorage server via
 * {@code LinkStorage.runServer}, configured from {@code ache.yml} in the
 * configuration folder.
 */
@Command(name = "startLinkStorage", description = "Starts a LinkStorage server")
public static class StartLinkStorage implements Runnable {

    @Option(name = {"-o", "--outputDir"}, required = true, description = "Path to a folder to store link storage data")
    String dataOutputPath;

    @Option(name = {"-c", "--configDir"}, required = true, description = "Path to configuration files folder")
    String configPath;

    // Same wording as the model option of the other commands, which pass the
    // same value on to the link storage.
    @Option(name = {"-m", "--model"}, required = true, description = "Path to folder containing page classifier model")
    String modelPath;

    @Option(name = {"-s", "--seed"}, required = false, description = "Path to the file containing seed URLs")
    String seedPath;

    /**
     * Loads the configuration and runs the server. Any failure, including
     * errors, is logged rather than propagated to the caller.
     */
    @Override
    public void run() {
        try {
            ConfigService config = new ConfigService(Paths.get(configPath, "ache.yml").toString());
            LinkStorage.runServer(configPath, seedPath, dataOutputPath, modelPath, config.getLinkStorageConfig());
        } catch (Throwable t) {
            logger.error("Something bad happened to LinkStorage :(", t);
        }
    }
}
/**
 * Command that runs a stand-alone TargetStorage server via
 * {@code TargetStorage.runServer}, configured from {@code ache.yml} in the
 * configuration folder.
 */
@Command(name = "startTargetStorage", description = "Starts a TargetStorage server")
public static class StartTargetStorage implements Runnable {

    @Option(name = {"-c", "--config"}, required = true, description = "Path to configuration files folder")
    String configPath;

    @Option(name = {"-m", "--modelDir"}, required = true, description = "Path to folder containing page classifier model")
    String modelPath;

    // This is the data output folder handed to the target storage, not a
    // place where a model gets written.
    @Option(name = {"-o", "--outputDir"}, required = true, description = "Path to a folder to store crawler data")
    String dataOutputPath;

    @Option(name = {"-e", "--elasticIndex"}, required = false, description = "Elasticsearch index name to be used")
    String esIndexName;

    @Option(name = {"-t", "--elasticType"}, required = false, description = "Elasticsearch type name to be used")
    String esTypeName;

    /**
     * Loads the configuration and runs the server. Any failure, including
     * errors, is logged rather than propagated to the caller.
     */
    @Override
    public void run() {
        try {
            ConfigService config = new ConfigService(Paths.get(configPath, "ache.yml").toString());
            TargetStorage.runServer(configPath, modelPath, dataOutputPath, esIndexName, esTypeName, config);
        } catch (Throwable t) {
            logger.error("Something bad happened to TargetStorage :(", t);
        }
    }
}
/**
 * Command that runs the crawl manager via {@code AsyncCrawler.run},
 * configured from {@code ache.yml} in the configuration folder.
 */
@Command(name = "startCrawlManager", description = "Starts a crawl manager")
public static class StartCrawlManager implements Runnable {

    @Option(name = {"-c", "--config"}, required = true, description = "Path to configuration files folder")
    String configPath;

    @Option(name = {"-o", "--outputDir"}, required = true, description = "Path to a folder to store crawl manager data")
    String dataPath;

    /**
     * Loads the configuration and runs the crawler. Any failure, including
     * errors, is logged rather than propagated to the caller.
     */
    @Override
    public void run() {
        try {
            ConfigService config = new ConfigService(Paths.get(configPath, "ache.yml").toString());
            AsyncCrawler.run(config, dataPath);
        } catch (Throwable t) {
            logger.error("Something bad happened to CrawlManager :(", t);
        }
    }
}
/**
 * Command that runs a complete crawl inside a single process. It starts the
 * REST server, creates the link storage and the target storage, and then
 * runs an {@link AsyncCrawler} on top of them.
 */
@Command(name = "startCrawl", description = "Starts a crawler")
public static class StartCrawl implements Runnable {

    @Option(name = {"-c", "--config"}, required = true, description = "Path to configuration files folder")
    String configPath;

    @Option(name = {"-m", "--modelDir"}, required = true, description = "Path to folder containing page classifier model")
    String modelPath;

    // This is the data output folder shared by the REST server, the storages
    // and the crawler, not a place where a model gets written.
    @Option(name = {"-o", "--outputDir"}, required = true, description = "Path to a folder to store crawler data")
    String dataOutputPath;

    @Option(name = {"-s", "--seed"}, required = true, description = "Path to file of seed URLs")
    String seedPath;

    @Option(name = {"-e", "--elasticIndex"}, required = false, description = "Name of Elasticsearch index to be used")
    String esIndexName;

    @Option(name = {"-t", "--elasticType"}, required = false, description = "Name of Elasticsearch document type to be used")
    String esTypeName;

    /**
     * Loads {@code ache.yml}, wires the components together and runs the
     * crawler until {@code crawler.run()} returns or throws.
     *
     * <p>Cleanup always runs in the order crawler, metrics manager, REST
     * server. It covers whatever had been created by the time a failure
     * happened, and each step runs even if an earlier cleanup step throws.
     * Failures inside the try block are logged rather than propagated; a
     * failure while loading the configuration is propagated to the caller.
     */
    @Override
    public void run() {
        ConfigService config = new ConfigService(Paths.get(configPath, "ache.yml").toString());
        try {
            MetricsManager metricsManager = new MetricsManager();
            RestServer startedRestServer = null;
            AsyncCrawler crawler = null;
            try {
                RestServer restServer = RestServer.create(dataOutputPath,
                        metricsManager.getMetricsRegistry(), config, esIndexName, esTypeName);
                restServer.start();
                // Only a server whose start() completed is shut down later.
                startedRestServer = restServer;

                Storage linkStorage = LinkStorage.createLinkStorage(configPath, seedPath,
                        dataOutputPath, modelPath, config.getLinkStorageConfig(), metricsManager);

                // start target storage
                Storage targetStorage = TargetStorage.createTargetStorage(configPath, modelPath,
                        dataOutputPath, esIndexName, esTypeName,
                        config.getTargetStorageConfig(), linkStorage);

                AsyncCrawlerConfig crawlerConfig = config.getCrawlerConfig();

                // start crawl manager
                crawler = new AsyncCrawler(targetStorage, linkStorage, crawlerConfig,
                        dataOutputPath, metricsManager);
                crawler.run();
            } finally {
                // Nested finally blocks keep one failing cleanup step from
                // skipping the remaining ones.
                try {
                    if (crawler != null) {
                        crawler.shutdown();
                    }
                } finally {
                    try {
                        metricsManager.close();
                    } finally {
                        if (startedRestServer != null) {
                            startedRestServer.shutdown();
                        }
                    }
                }
            }
        }
        catch (FrontierPersistentException e) {
            logger.error("Problem while creating LinkStorage", e);
        }
        catch (IOException e) {
            logger.error("Problem while starting crawler.", e);
        }
        catch (Throwable e) {
            logger.error("Crawler execution failed.", e);
        }
    }
}
}