package io.monokkel;
import akka.actor.ActorRef;
import akka.actor.ActorSelection;
import akka.actor.ActorSystem;
import akka.actor.PoisonPill;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import io.monokkel.actors.CoordinatorActor;
import io.monokkel.configuration.CrawlerConfiguration;
import io.monokkel.configuration.CrawlerConfigurationBuilder;
import io.monokkel.configuration.CrawlerWireObject;
import io.monokkel.core.*;
import io.monokkel.exceptions.FatalFault;
import io.monokkel.factories.ClientFactory;
import io.monokkel.factories.HttpClient4Builder;
import io.monokkel.messages.StartSystem;
import org.apache.commons.cli.*;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import static io.monokkel.configuration.CrawlerConfiguration.COORDINATION_ACTOR;
import static java.lang.String.format;
import static scala.concurrent.duration.Duration.create;
/**
 * Application entry point for the crawler.
 *
 * <p>Parses command-line arguments and a properties configuration file, wires up the
 * crawler collaborators ({@link CrawlerWireObject}), starts an Akka actor system with a
 * single {@link CoordinatorActor}, and blocks until the configured duration elapses or
 * the JVM receives a shutdown signal (Ctrl+C), at which point the coordinator is
 * poison-pilled via a shutdown hook.
 */
public class Crawler {

    public static final String CRAWLER_SYSTEM = "crawlerSystem";

    // Fix: was LoggerFactory.getLogger(Crawler.class.getClass()), which resolves to
    // Class<Class> and names the logger "java.lang.Class" instead of this class.
    private static final Logger log = LoggerFactory.getLogger(Crawler.class);

    // ObjectMapper is thread-safe and expensive to construct; share one instance
    // across all transformation-file reads instead of building one per call.
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    // Assigned in executeCrawler(); may still be null when the shutdown hook runs
    // (e.g. Ctrl+C while the configuration is being parsed) — killCrawler() guards this.
    private ActorSystem actorSystem;

    /**
     * Parses arguments and configuration, installs a shutdown hook that stops the
     * coordinator actor, then runs the crawler. Exits with status 1 on any
     * {@link FatalFault}.
     *
     * @param args command-line arguments; see {@link #createOptions()}
     */
    public static void main(String[] args) {
        final Crawler crawler = new Crawler();
        try {
            ProgramArguments programArguments = parseProgramArguments(args);
            final String configurationPath = programArguments.getConfigurationPath();
            CrawlerConfiguration crawlerConfiguration =
                    parseAndGetConfigurationFromFolder(configurationPath, programArguments);
            CrawlerWireObject crawlerWireObject = instanceAndPopulateWireObject(crawlerConfiguration);
            final Thread mainThread = Thread.currentThread();
            // On Ctrl+C (or any JVM shutdown) ask the coordinator to stop, then wait for
            // the main thread so the actor system can terminate gracefully.
            Runtime.getRuntime().addShutdownHook(new Thread() {
                @Override
                public void run() {
                    crawler.killCrawler();
                    try {
                        mainThread.join();
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt(); // restore interrupt status
                        log.warn("Shutdown process interrupted", e);
                    }
                }
            });
            crawler.executeCrawler(crawlerConfiguration, crawlerWireObject);
        } catch (FatalFault e) {
            log.error("Crawler terminated with an error exit 1", e);
            System.exit(1);
        }
    }

    /**
     * Builds the object graph the coordinator actor needs: URL visitor, HTML/JSON
     * transformers, data cleaner, validator and indexer.
     *
     * @param crawlerConfiguration fully populated configuration
     * @return the wired collaborators
     * @throws FatalFault if any of the transformation/validation JSON files cannot be read
     */
    @SuppressWarnings("unchecked")
    private static CrawlerWireObject instanceAndPopulateWireObject(CrawlerConfiguration crawlerConfiguration) throws FatalFault {
        final HttpClient4Builder httpClient4Builder = new HttpClient4Builder();
        final UrlVisitor urlvisitor = new UrlVisitor(httpClient4Builder);
        final HashMap transformationMap = readTransformationFile(crawlerConfiguration.getTransformationPath());
        final HtmlTransformer htmlTransformer = new HtmlTransformer(
                crawlerConfiguration.getFilters(),
                crawlerConfiguration.getAttributeToLocateContent(),
                crawlerConfiguration.getContentRetrievalExpression(),
                transformationMap);
        final ClientFactory clientFactory = new ClientFactory(
                crawlerConfiguration.getIndexNodeHost(), crawlerConfiguration.getIndexNodePort());
        final DataCleaner dataCleaner =
                new DataCleaner(readTransformationFile(crawlerConfiguration.getCleanDataFilePath()));
        // Note: "Fiend" is a typo in the configuration API; kept as-is for compatibility.
        final List<String> fieldToFiendNextUrl = crawlerConfiguration.getFieldToFiendNextUrl();
        JsonSupport jsonTransformer;
        if (crawlerConfiguration.isJsonTransformationEnabled()) {
            final Map<String, List<String>> hashMap =
                    readTransformationFile(crawlerConfiguration.getJsonTransformationPath());
            jsonTransformer = new JsonTransformer(hashMap, fieldToFiendNextUrl);
        } else {
            // No JSON transformation configured: fall back to a pass-through parser.
            jsonTransformer = new NoOperationsJsonParser(fieldToFiendNextUrl);
        }
        String indexName = crawlerConfiguration.getIndexName();
        String indexType = crawlerConfiguration.getIndexType();
        Indexer indexer = new Indexer(indexName, indexType, clientFactory,
                crawlerConfiguration.isShouldIndexRawContent());
        DataValidator dataValidator =
                new DataValidator(readTransformationFile(crawlerConfiguration.getValidatorPath()));
        return new CrawlerWireObject(urlvisitor, htmlTransformer, indexer, dataCleaner,
                dataValidator, jsonTransformer);
    }

    /**
     * Reads a JSON file into a (raw) {@code HashMap}.
     *
     * @param transformationPath path to the JSON file
     * @return the parsed map; callers cast the value type as needed
     * @throws FatalFault if the file is missing or not valid JSON
     */
    private static HashMap readTransformationFile(final String transformationPath) throws FatalFault {
        try {
            return OBJECT_MAPPER.readValue(new File(transformationPath), HashMap.class);
        } catch (IOException e) {
            // Include the offending path so the failure is diagnosable from the log alone.
            throw new FatalFault("Failed to parse " + transformationPath, e);
        }
    }

    /**
     * Parses the command line into {@link ProgramArguments}; prints usage and throws on
     * malformed input.
     *
     * @throws FatalFault if the arguments cannot be parsed
     */
    private static ProgramArguments parseProgramArguments(String[] args) throws FatalFault {
        Options options = createOptions();
        CommandLineParser commandLineParser = new BasicParser();
        try {
            final CommandLine commandOutput = commandLineParser.parse(options, args);
            final String[] seedUrls = commandOutput.getOptionValues("seedUrls");
            final String configurationPath = commandOutput.getOptionValue("configurationPath");
            return new ProgramArguments(configurationPath, Lists.newArrayList(seedUrls));
        } catch (ParseException e) {
            // Fix: passing (Object[]) args spread the array as SLF4J varargs, so only
            // args[0] was logged. Join the arguments so the full command line is shown.
            log.error("Failed to parse the command input {}", String.join(" ", args));
            HelpFormatter formatter = new HelpFormatter();
            formatter.printHelp("crawler.jar", options);
            throw new FatalFault(e);
        }
    }

    /** Declares the supported command-line options: seedUrls (multi-valued) and configurationPath. */
    private static Options createOptions() {
        final Options options = new Options();
        createOption(options, true, " seed urls ", "seedUrls", true);
        createOption(options, true, " path to configuration file name ", "configurationPath", false);
        return options;
    }

    /**
     * Registers a single required/optional option on {@code options}.
     *
     * <p>{@code OptionBuilder} is a legacy commons-cli builder that accumulates state in
     * static fields; every call below mutates the same pending option until
     * {@code create()} consumes and resets it. The original code bound intermediate
     * references to that shared static state, which only obscured this.
     *
     * @param hasMoreArgs true if the option accepts multiple values
     */
    @SuppressWarnings("static-access")
    private static void createOption(final Options options, final boolean isRequired,
                                     final String description, final String parameterName,
                                     final boolean hasMoreArgs) {
        OptionBuilder.isRequired(isRequired);
        OptionBuilder.withArgName(parameterName);
        if (hasMoreArgs) {
            OptionBuilder.hasArgs();
        } else {
            OptionBuilder.hasArg();
        }
        OptionBuilder.withDescription(description);
        final Option parameter = OptionBuilder.create(parameterName);
        options.addOption(parameter);
    }

    /**
     * Starts the actor system and the coordinator, then blocks the calling thread until
     * the configured duration (in hours) elapses, or forever when no duration is set.
     *
     * @throws FatalFault declared for interface stability; not thrown directly here
     */
    public void executeCrawler(final CrawlerConfiguration crawlerConfiguration,
                               final CrawlerWireObject crawlerWireObject) throws FatalFault {
        log.info("Reading configuration");
        actorSystem = ActorSystem.create(CRAWLER_SYSTEM);
        final ActorRef coordinationActor = actorSystem.actorOf(
                CoordinatorActor.props(crawlerWireObject, crawlerConfiguration), COORDINATION_ACTOR);
        coordinationActor.tell(new StartSystem(crawlerConfiguration), ActorRef.noSender());
        final Long maxDuration = crawlerConfiguration.getMaxDuration();
        if (maxDuration > 0L) {
            log.info("The crawler will run for {} hour(s) or until Ctrl+C is called", maxDuration);
            actorSystem.awaitTermination(create(maxDuration, TimeUnit.HOURS));
        } else {
            log.info("No duration specified in the configuration file. The crawler will run forever until Ctrl+C is pressed");
            actorSystem.awaitTermination();
        }
    }

    /**
     * Sends a {@link PoisonPill} to the coordinator actor so the system can shut down
     * gracefully. Safe to call before {@link #executeCrawler} has started the system.
     */
    public void killCrawler() {
        // Fix: the shutdown hook may run before executeCrawler() assigned actorSystem
        // (e.g. Ctrl+C during configuration parsing); previously this threw an NPE.
        if (actorSystem == null) {
            log.warn("Actor system was never started; nothing to shut down");
            return;
        }
        log.info("Will try to find actor {}", CrawlerConfiguration.COORDINATION_ACTOR);
        final ActorSelection actorSelection =
                actorSystem.actorSelection(String.format("/user/%s", COORDINATION_ACTOR));
        log.info("Will try to send a poison message to {}", COORDINATION_ACTOR);
        actorSelection.tell(PoisonPill.getInstance(), ActorRef.noSender());
    }

    /**
     * Loads the properties file at {@code configurationPath} and builds a validated
     * {@link CrawlerConfiguration}, merging in the command-line seed URLs.
     *
     * @throws FatalFault if the file cannot be loaded or a required property is missing
     */
    private static CrawlerConfiguration parseAndGetConfigurationFromFolder(
            final String configurationPath, final ProgramArguments programArguments) throws FatalFault {
        CrawlerConfiguration crawlerConfiguration;
        try {
            Configuration configuration = new PropertiesConfiguration(configurationPath);
            final String[] filterArray = getStringArray(configurationPath, configuration, "crawler.url.filters");
            final String indexName = getString(configurationPath, configuration, "crawler.indexName");
            final String indexType = getString(configurationPath, configuration, "crawler.indexType");
            final String indexHost = getString(configurationPath, configuration, "crawler.indexHost");
            final Integer indexPort = getInteger(configurationPath, configuration, "crawler.indexPort");
            final Long maxDuration = getLong(configurationPath, configuration, "crawler.maxDuration");
            final String contentRetrievalExpression = getString(configurationPath, configuration, "crawler.contentRetrievalExpression");
            final String attributeToLocateContent = getString(configurationPath, configuration, "crawler.attributeToLocateContent");
            final Integer maxDepth = getInteger(configurationPath, configuration, "crawler.maxDepth");
            final Boolean createEmbeddedElasticNode = getBoolean(configuration, "elastic.createEmbeddedElasticNode");
            final String htmlTransformationPath = getString(configurationPath, configuration, "crawler.htmlTransformationPath");
            final String jsonTransformationPath = getString(configurationPath, configuration, "crawler.jsonTransformationPath");
            final String cleanDataFilePath = getString(configurationPath, configuration, "crawler.cleanFileDescriptionPath");
            final Integer urlsPer = getInteger(configurationPath, configuration, "crawler.throttler.urlsPer");
            final Integer seconds = getInteger(configurationPath, configuration, "crawler.throttler.seconds");
            final Boolean disableThrottle = getBoolean(configuration, "crawler.throttler.disable");
            final String validatorPath = getString(configurationPath, configuration, "crawler.validatorPath");
            final String[] fieldToFindNextUrl = getStringArray(configurationPath, configuration, "crawler.url.json.fieldToFindNextUrl");
            final Boolean jsonTransformationEnabled = getBoolean(configuration, "crawler.isJsonTransformationEnabled");
            final Boolean shouldIndexRawContent = getBoolean(configuration, "crawler.shouldIndexRawContent");
            crawlerConfiguration = new CrawlerConfigurationBuilder()
                    .setValidatorPath(validatorPath)
                    .setFilters(Lists.newArrayList(filterArray))
                    .setIndexName(indexName)
                    .setIndexType(indexType)
                    .setIndexNodeHost(indexHost)
                    .setIndexNodePort(indexPort)
                    .setMaxDuration(maxDuration)
                    .setSeedUrls(programArguments.getSeedUrls())
                    .setContentRetrievalExpression(contentRetrievalExpression)
                    .setAttributeToLocateContent(attributeToLocateContent)
                    .setCreateEmbeddedElasticNode(createEmbeddedElasticNode)
                    .setMaxDepth(maxDepth)
                    .setTransformationPath(htmlTransformationPath)
                    .setCleanDataFilePath(cleanDataFilePath)
                    .setSeconds(seconds)
                    .setDisableThrottle(disableThrottle)
                    .setUrlsPer(urlsPer)
                    .setJsonTransformationPath(jsonTransformationPath)
                    .setFieldToFindNextUrl(Lists.newArrayList(fieldToFindNextUrl))
                    .setJsonTransformationEnabled(jsonTransformationEnabled)
                    .setShouldIndexRawContent(shouldIndexRawContent)
                    .createCrawlerConfiguration();
        } catch (ConfigurationException e) {
            log.error("Failed to parse configuration file {}", configurationPath);
            throw new FatalFault(e);
        }
        return crawlerConfiguration;
    }

    /** Reads an optional boolean property, defaulting to {@code false} when absent. */
    private static Boolean getBoolean(final Configuration configuration, final String configurationProperty) {
        return configuration.getBoolean(configurationProperty, false);
    }

    /**
     * Reads a required long property.
     *
     * <p>Fix: the original used {@code -2L} as a "missing" sentinel, which falsely
     * rejected a legitimately configured value of -2. A {@code null} default is exact.
     *
     * @throws ConfigurationException if the property is absent
     */
    private static Long getLong(String configurationPath, Configuration configuration, String configurationProperty) throws ConfigurationException {
        final Long longValue = configuration.getLong(configurationProperty, (Long) null);
        if (longValue == null) {
            throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, configurationProperty));
        }
        log.debug("Reading configuration field {} and found {} ", configurationProperty, longValue);
        return longValue;
    }

    /**
     * Reads a required, non-empty string property.
     *
     * @throws ConfigurationException if the property is absent or empty
     */
    private static String getString(String configurationPath, Configuration configuration, String configurationProperty) throws ConfigurationException {
        final String propertyValue = configuration.getString(configurationProperty);
        if (propertyValue == null || propertyValue.isEmpty()) {
            throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, configurationProperty));
        }
        log.debug("Reading configuration field {} and found {} ", configurationProperty, propertyValue);
        return propertyValue;
    }

    /**
     * Reads a required integer property.
     *
     * <p>Fix: same sentinel problem as {@link #getLong} — {@code -2} was previously
     * treated as "missing" even when explicitly configured.
     *
     * @throws ConfigurationException if the property is absent
     */
    private static Integer getInteger(String configurationPath, Configuration configuration, String configurationProperty) throws ConfigurationException {
        final Integer integerValue = configuration.getInteger(configurationProperty, null);
        if (integerValue == null) {
            throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, configurationProperty));
        }
        log.debug("Reading configuration field {} and found {} ", configurationProperty, integerValue);
        return integerValue;
    }

    /**
     * Reads a required multi-valued string property.
     *
     * @throws ConfigurationException if the property is absent or has no values
     */
    private static String[] getStringArray(String configurationPath, Configuration configuration, String crawlerUrlFiltersProperty) throws ConfigurationException {
        final String[] filterArray = configuration.getStringArray(crawlerUrlFiltersProperty);
        if (filterArray == null || filterArray.length == 0) {
            throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, crawlerUrlFiltersProperty));
        }
        // Fix: logging the array reference printed its identity hash, not its contents.
        log.debug("Reading configuration field {} and found {} ", crawlerUrlFiltersProperty, String.join(", ", filterArray));
        return filterArray;
    }
}