package io.monokkel; import akka.actor.ActorRef; import akka.actor.ActorSelection; import akka.actor.ActorSystem; import akka.actor.PoisonPill; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Lists; import io.monokkel.actors.CoordinatorActor; import io.monokkel.configuration.CrawlerConfiguration; import io.monokkel.configuration.CrawlerConfigurationBuilder; import io.monokkel.configuration.CrawlerWireObject; import io.monokkel.core.*; import io.monokkel.exceptions.FatalFault; import io.monokkel.factories.ClientFactory; import io.monokkel.factories.HttpClient4Builder; import io.monokkel.messages.StartSystem; import org.apache.commons.cli.*; import org.apache.commons.configuration.Configuration; import org.apache.commons.configuration.ConfigurationException; import org.apache.commons.configuration.PropertiesConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import static io.monokkel.configuration.CrawlerConfiguration.COORDINATION_ACTOR; import static java.lang.String.format; import static scala.concurrent.duration.Duration.create; public class Crawler { public static final String CRAWLER_SYSTEM = "crawlerSystem"; private static Logger log = LoggerFactory.getLogger(Crawler.class.getClass()); private ActorSystem actorSystem; public static void main(String[] args) { Crawler crawler = new Crawler(); try { ProgramArguments programArguments = parseProgramArguments(args); final String configurationPath = programArguments.getConfigurationPath(); CrawlerConfiguration crawlerConfiguration = parseAndGetConfigurationFromFolder(configurationPath, programArguments); CrawlerWireObject crawlerWireObject = instanceAndPopulateWireObject(crawlerConfiguration); final Thread mainThread = Thread.currentThread(); Runtime.getRuntime().addShutdownHook(new Thread() { public void run() { crawler.killCrawler(); try { mainThread.join(); } catch (InterruptedException e) { log.warn("Shutdown process interrupted", e); } } }); crawler.executeCrawler(crawlerConfiguration, crawlerWireObject); } catch (FatalFault e) { log.error("Crawler terminated with an error exit 1", e); System.exit(1); } } @SuppressWarnings("unchecked") private static CrawlerWireObject instanceAndPopulateWireObject(CrawlerConfiguration crawlerConfiguration) throws FatalFault { final HttpClient4Builder httpClient4Builder = new HttpClient4Builder(); final UrlVisitor urlvisitor = new UrlVisitor(httpClient4Builder); final HashMap transformationMap = readTransformationFile(crawlerConfiguration.getTransformationPath()); final HtmlTransformer htmlTransformer = new HtmlTransformer(crawlerConfiguration.getFilters(), crawlerConfiguration.getAttributeToLocateContent(), crawlerConfiguration.getContentRetrievalExpression(), transformationMap); final ClientFactory clientFactory = new ClientFactory(crawlerConfiguration.getIndexNodeHost(), crawlerConfiguration.getIndexNodePort()); final DataCleaner dataCleaner = new DataCleaner(readTransformationFile(crawlerConfiguration.getCleanDataFilePath())); JsonSupport jsonTransformer; final List<String> fieldToFiendNextUrl = crawlerConfiguration.getFieldToFiendNextUrl(); if (crawlerConfiguration.isJsonTransformationEnabled()) { final Map<String, List<String>> hashMap = readTransformationFile(crawlerConfiguration.getJsonTransformationPath()); jsonTransformer = new JsonTransformer(hashMap, fieldToFiendNextUrl); } else { jsonTransformer = new NoOperationsJsonParser(fieldToFiendNextUrl); } String indexName = crawlerConfiguration.getIndexName(); String indexType = crawlerConfiguration.getIndexType(); Indexer indexer = new Indexer(indexName, indexType, clientFactory, crawlerConfiguration.isShouldIndexRawContent()); DataValidator dataValidator = new DataValidator(readTransformationFile(crawlerConfiguration.getValidatorPath())); return new CrawlerWireObject(urlvisitor, htmlTransformer, indexer, dataCleaner, dataValidator, jsonTransformer); } private static HashMap readTransformationFile(final String transformationPath) throws FatalFault { ObjectMapper mapper = new ObjectMapper(); HashMap jsonMap; try { jsonMap = mapper.readValue(new File(transformationPath), HashMap.class); } catch (IOException e) { throw new FatalFault("Failed to parse", e); } return jsonMap; } private static ProgramArguments parseProgramArguments(String[] args) throws FatalFault { Options options = createOptions(); CommandLineParser commandLineParser = new BasicParser(); try { final CommandLine commandOutput = commandLineParser.parse(options, args); final String[] seedUrls = commandOutput.getOptionValues("seedUrls"); final String configurationPath = commandOutput.getOptionValue("configurationPath"); return new ProgramArguments(configurationPath, Lists.newArrayList(seedUrls)); } catch (ParseException e) { log.error("Failed to parse the command input {}", (Object[]) args); HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("crawler.jar", options); throw new FatalFault(e); } } private static Options createOptions() { final Options options = new Options(); createOption(options, true, " seed urls ", "seedUrls", true); createOption(options, true, " path to configuration file name ", "configurationPath", false); return options; } @SuppressWarnings("static-access") private static void createOption(final Options options, final boolean isRequired, final String description, final String parameterName, final boolean hasMoreArgs) { final OptionBuilder isRequiredOption = OptionBuilder.isRequired(isRequired); final OptionBuilder withArgName = isRequiredOption.withArgName(parameterName); OptionBuilder hasArg; if (hasMoreArgs) { hasArg = withArgName.hasArgs(); } else { hasArg = withArgName.hasArg(); } hasArg.withDescription(description); Option parameter = OptionBuilder .create(parameterName); options.addOption(parameter); } public void executeCrawler(final CrawlerConfiguration crawlerConfiguration, final CrawlerWireObject crawlerWireObject) throws FatalFault { log.info("Reading configuration"); actorSystem = ActorSystem.create(CRAWLER_SYSTEM); final ActorRef coordinationActor = actorSystem.actorOf(CoordinatorActor.props(crawlerWireObject, crawlerConfiguration), COORDINATION_ACTOR); coordinationActor.tell(new StartSystem(crawlerConfiguration), ActorRef.noSender()); final Long maxDuration = crawlerConfiguration.getMaxDuration(); if (maxDuration > 0L) { log.info("The crawler will run for {} hour(s) or until Ctrl+C is called", maxDuration); actorSystem.awaitTermination(create(maxDuration, TimeUnit.HOURS)); } else { log.info("No duration specified in the configuration file. The crawler will run forever until Ctrl+C is pressed"); actorSystem.awaitTermination(); } } public void killCrawler() { log.info("Will try to find actor {}", CrawlerConfiguration.COORDINATION_ACTOR); final ActorSelection actorSelection = actorSystem.actorSelection(String.format("/user/%s", COORDINATION_ACTOR)); log.info("Will try to send a poison message to {}", COORDINATION_ACTOR); actorSelection.tell(PoisonPill.getInstance(), ActorRef.noSender()); } private static CrawlerConfiguration parseAndGetConfigurationFromFolder(final String configurationPath, final ProgramArguments programArguments) throws FatalFault { CrawlerConfiguration crawlerConfiguration; try { Configuration configuration = new PropertiesConfiguration(configurationPath); final String[] filterArray = getStringArray(configurationPath, configuration, "crawler.url.filters"); final String indexName = getString(configurationPath, configuration, "crawler.indexName"); final String indexType = getString(configurationPath, configuration, "crawler.indexType"); final String indexHost = getString(configurationPath, configuration, "crawler.indexHost"); final Integer indexPort = getInteger(configurationPath, configuration, "crawler.indexPort"); final Long maxDuration = getLong(configurationPath, configuration, "crawler.maxDuration"); final String contentRetrievalExpression = getString(configurationPath, configuration, "crawler.contentRetrievalExpression"); final String attributeToLocateContent = getString(configurationPath, configuration, "crawler.attributeToLocateContent"); final Integer maxDepth = getInteger(configurationPath, configuration, "crawler.maxDepth"); final Boolean createEmbeddedElasticNode = getBoolean(configuration, "elastic.createEmbeddedElasticNode"); final String htmlTransformationPath = getString(configurationPath, configuration, "crawler.htmlTransformationPath"); final String jsonTransformationPath = getString(configurationPath, configuration, "crawler.jsonTransformationPath"); final String cleanDataFilePath = getString(configurationPath, configuration, "crawler.cleanFileDescriptionPath"); final Integer urlsPer = getInteger(configurationPath, configuration, "crawler.throttler.urlsPer"); final Integer seconds = getInteger(configurationPath, configuration, "crawler.throttler.seconds"); final Boolean disableThrottle = getBoolean(configuration, "crawler.throttler.disable"); final String validatorPath = getString(configurationPath, configuration, "crawler.validatorPath"); final String[] fieldToFindNextUrl = getStringArray(configurationPath, configuration, "crawler.url.json.fieldToFindNextUrl"); final Boolean jsonTransformationEnabled = getBoolean(configuration, "crawler.isJsonTransformationEnabled"); final Boolean shouldIndexRawContent = getBoolean(configuration, "crawler.shouldIndexRawContent"); crawlerConfiguration = new CrawlerConfigurationBuilder().setValidatorPath(validatorPath) .setFilters(Lists.newArrayList(filterArray)).setIndexName(indexName) .setIndexType(indexType).setIndexNodeHost(indexHost) .setIndexNodePort(indexPort) .setMaxDuration(maxDuration) .setSeedUrls(programArguments .getSeedUrls()) .setContentRetrievalExpression(contentRetrievalExpression) .setAttributeToLocateContent(attributeToLocateContent) .setCreateEmbeddedElasticNode(createEmbeddedElasticNode) .setMaxDepth(maxDepth) .setTransformationPath(htmlTransformationPath) .setCleanDataFilePath(cleanDataFilePath) .setSeconds(seconds) .setDisableThrottle(disableThrottle) .setUrlsPer(urlsPer) .setJsonTransformationPath(jsonTransformationPath) .setFieldToFindNextUrl(Lists.newArrayList(fieldToFindNextUrl)) .setJsonTransformationEnabled(jsonTransformationEnabled) .setShouldIndexRawContent(shouldIndexRawContent).createCrawlerConfiguration(); } catch (ConfigurationException e) { log.error("Failed to parse configuration file {}", configurationPath); throw new FatalFault(e); } return crawlerConfiguration; } private static Boolean getBoolean(final Configuration configuration, final String configurationProperty) { return configuration.getBoolean(configurationProperty, false); } private static Long getLong(String configurationPath, Configuration configuration, String configurationProperty) throws ConfigurationException { final Long longValue = configuration.getLong(configurationProperty, -2L); if (longValue == -2L) { throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, configurationProperty)); } log.debug("Reading configuration field {} and found {} ", configurationProperty, longValue); return longValue; } private static String getString(String configurationPath, Configuration configuration, String configurationProperty) throws ConfigurationException { final String propertyValue = configuration.getString(configurationProperty); if (propertyValue == null || propertyValue.isEmpty()) { throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, configurationProperty)); } log.debug("Reading configuration field {} and found {} ", configurationProperty, propertyValue); return propertyValue; } private static Integer getInteger(String configurationPath, Configuration configuration, String configurationProperty) throws ConfigurationException { final Integer integerValue = configuration.getInteger(configurationProperty, -2); if (integerValue == -2) { throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, configurationProperty)); } log.debug("Reading configuration field {} and found {} ", configurationProperty, integerValue); return integerValue; } private static String[] getStringArray(String configurationPath, Configuration configuration, String crawlerUrlFiltersProperty) throws ConfigurationException { final String[] filterArray = configuration.getStringArray(crawlerUrlFiltersProperty); if (filterArray == null || filterArray.length == 0) { throw new ConfigurationException(format("Failed to read configuration file %s. The property %s can not be validated", configurationPath, crawlerUrlFiltersProperty)); } log.debug("Reading configuration field {} and found {} ", crawlerUrlFiltersProperty, filterArray); return filterArray; } }