package eu.europeana.cloud.service.dps.storm.topologies.extractandindex;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.AuthorizationException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.generated.StormTopology;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.testing.FeederSpout;
import backtype.storm.topology.IRichSpout;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;
import backtype.storm.utils.Utils;
import eu.europeana.cloud.service.dps.PluginParameterKeys;
import eu.europeana.cloud.service.dps.index.SupportedIndexers;
import eu.europeana.cloud.service.dps.storm.AbstractDpsBolt;
import eu.europeana.cloud.service.dps.storm.NotificationBolt;
import eu.europeana.cloud.service.dps.storm.NotificationTuple;
import eu.europeana.cloud.service.dps.storm.ParseTaskBolt;
import eu.europeana.cloud.service.dps.storm.io.ReadDatasetBolt;
import eu.europeana.cloud.service.dps.storm.io.ReadFileBolt;
import eu.europeana.cloud.service.dps.storm.io.StoreFileAsRepresentationBolt;
import eu.europeana.cloud.service.dps.storm.kafka.KafkaMetricsConsumer;
import eu.europeana.cloud.service.dps.storm.topologies.indexer.IndexBolt;
import eu.europeana.cloud.service.dps.storm.topologies.text.ExtractTextBolt;

import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.StringScheme;
import storm.kafka.ZkHosts;

/**
 * This topology combines {@link TextStrippingTopology} and {@link IndexTopology}.
 *
 * @author Pavel Kefurt <Pavel.Kefurt@gmail.com>
 */
public class ExtractAndIndexTopology {

    public enum SpoutType {
        KAFKA,
        FEEDER
    }

    private final SpoutType spoutType;

    private final String datasetStream = "ReadDataset";
    private final String fileStream = "ReadFile";
    private final String storeStream = "StoreStream";
    private final String indexStream = "IndexStream";

    private final String ecloudMcsAddress = ExtractAndIndexConstants.MCS_URL;
    private final String username = ExtractAndIndexConstants.USERNAME;
    private final String password = ExtractAndIndexConstants.PASSWORD;
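    /*
     * Summary of the tuple flow wired up in buildTopology() below:
     *
     *   KafkaSpout -> ParseDpsTask -> RetrieveDataset / RetrieveFile -> ExtractText
     *       ExtractText --(storeStream)--> StoreNewRepresentation --> IndexBolt
     *       ExtractText --(indexStream)--> IndexBolt
     *
     * In addition, every bolt emits to NotificationBolt on the notification stream,
     * grouped by task id, so task progress can be recorded in Cassandra.
     */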
    /**
     * Constructor of extract and index topology.
     *
     * @param spoutType type of spout
     */
    public ExtractAndIndexTopology(SpoutType spoutType) {
        this.spoutType = spoutType;
    }

    protected StormTopology buildTopology() {
        Map<SupportedIndexers, String> indexersAddresses = new HashMap<>();
        indexersAddresses.put(SupportedIndexers.ELASTICSEARCH_INDEXER,
                ExtractAndIndexConstants.ELASTICSEARCH_ADDRESSES);
        indexersAddresses.put(SupportedIndexers.SOLR_INDEXER,
                ExtractAndIndexConstants.SOLR_ADDRESSES);

        Map<String, String> routingRules = new HashMap<>();
        routingRules.put(PluginParameterKeys.NEW_DATASET_MESSAGE, datasetStream);
        routingRules.put(PluginParameterKeys.NEW_FILE_MESSAGE, fileStream);

        Map<String, String> prerequisites = new HashMap<>();
        prerequisites.put(PluginParameterKeys.EXTRACT_TEXT, "True");
        prerequisites.put(PluginParameterKeys.INDEX_DATA, "True");
        prerequisites.put(PluginParameterKeys.INDEXER, null);

        TopologyBuilder builder = new TopologyBuilder();

        builder.setSpout("KafkaSpout", getSpout(), ExtractAndIndexConstants.KAFKA_SPOUT_PARALLEL);

        builder.setBolt("ParseDpsTask", new ParseTaskBolt(routingRules, prerequisites),
                ExtractAndIndexConstants.PARSE_TASKS_BOLT_PARALLEL)
                .shuffleGrouping("KafkaSpout");

        builder.setBolt("RetrieveDataset", new ReadDatasetBolt(ecloudMcsAddress, username, password),
                ExtractAndIndexConstants.DATASET_BOLT_PARALLEL)
                .shuffleGrouping("ParseDpsTask", datasetStream);

        builder.setBolt("RetrieveFile", new ReadFileBolt(ecloudMcsAddress, username, password),
                ExtractAndIndexConstants.FILE_BOLT_PARALLEL)
                .shuffleGrouping("ParseDpsTask", fileStream);

        builder.setBolt("ExtractText", new ExtractTextBolt(indexStream, storeStream),
                ExtractAndIndexConstants.EXTRACT_BOLT_PARALLEL)
                .shuffleGrouping("RetrieveDataset")
                .shuffleGrouping("RetrieveFile");

        builder.setBolt("StoreNewRepresentation",
                new StoreFileAsRepresentationBolt(ecloudMcsAddress, username, password),
                ExtractAndIndexConstants.STORE_BOLT_PARALLEL)
                .shuffleGrouping("ExtractText", storeStream);

        builder.setBolt("IndexBolt",
                new IndexBolt(indexersAddresses, ExtractAndIndexConstants.CACHE_SIZE),
                ExtractAndIndexConstants.INDEX_BOLT_PARALLEL)
                .shuffleGrouping("ExtractText", indexStream)
                .shuffleGrouping("StoreNewRepresentation");

        builder.setBolt("NotificationBolt",
                new NotificationBolt(ExtractAndIndexConstants.CASSANDRA_HOSTS,
                        ExtractAndIndexConstants.CASSANDRA_PORT,
                        ExtractAndIndexConstants.CASSANDRA_KEYSPACE_NAME,
                        ExtractAndIndexConstants.CASSANDRA_USERNAME,
                        ExtractAndIndexConstants.CASSANDRA_PASSWORD),
                ExtractAndIndexConstants.NOTIFICATION_BOLT_PARALLEL)
                .fieldsGrouping("ParseDpsTask", AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping("RetrieveDataset", AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping("RetrieveFile", AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping("ExtractText", AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping("StoreNewRepresentation", AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping("IndexBolt", AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName));

        return builder.createTopology();
    }
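    /*
     * Note: with SpoutType.FEEDER the topology can be driven from a test without Kafka,
     * by feeding serialized task strings into the spout. Illustrative sketch only; the
     * payload below is an assumption, not the real DPS task format:
     *
     *   FeederSpout feeder = new FeederSpout(new StringScheme().getOutputFields());
     *   feeder.feed(new Values("{ ...serialized DpsTask... }"));
     */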
    private IRichSpout getSpout() {
        switch (spoutType) {
            case FEEDER:
                return new FeederSpout(new StringScheme().getOutputFields());
            case KAFKA:
            default:
                SpoutConfig kafkaConfig = new SpoutConfig(
                        new ZkHosts(ExtractAndIndexConstants.INPUT_ZOOKEEPER),
                        ExtractAndIndexConstants.KAFKA_INPUT_TOPIC,
                        ExtractAndIndexConstants.ZOOKEEPER_ROOT,
                        UUID.randomUUID().toString());
                kafkaConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
                return new KafkaSpout(kafkaConfig);
        }
    }

    /**
     * @param args the command line arguments
     * <ol>
     * <li>topology name (e.g. index_topology)</li>
     * <li>number of workers (e.g. 1)</li>
     * <li>max task parallelism (e.g. 1)</li>
     * <!--
     * <li>zookeeper servers (e.g. localhost;another.server.com) - STORM_ZOOKEEPER_SERVERS</li>
     * <li>zookeeper port (e.g. 2181) - STORM_ZOOKEEPER_PORT</li>
     * <li>nimbus host (e.g. localhost) - NIMBUS_HOST</li>
     * <li>nimbus port (e.g. 6627) - NIMBUS_THRIFT_PORT</li>
     * -->
     * <li>JVM parameters (e.g. "-Dhttp.proxyHost=xxx -Dhttp.proxyPort=xx") - TOPOLOGY_WORKER_CHILDOPTS</li>
     * </ol>
     * @throws backtype.storm.generated.AlreadyAliveException
     * @throws backtype.storm.generated.InvalidTopologyException
     * @throws backtype.storm.generated.AuthorizationException
     */
    public static void main(String[] args)
            throws AlreadyAliveException, InvalidTopologyException, AuthorizationException {
        ExtractAndIndexTopology extractAndIndexTopology = new ExtractAndIndexTopology(SpoutType.KAFKA);

        Config config = new Config();
        config.setDebug(false);

        /*
        Map<String, String> kafkaMetricsConfig = new HashMap<>();
        kafkaMetricsConfig.put(KafkaMetricsConsumer.KAFKA_BROKER_KEY, ExtractAndIndexConstants.KAFKA_METRICS_BROKER);
        kafkaMetricsConfig.put(KafkaMetricsConsumer.KAFKA_TOPIC_KEY, ExtractAndIndexConstants.KAFKA_METRICS_TOPIC);
        config.registerMetricsConsumer(KafkaMetricsConsumer.class, kafkaMetricsConfig,
                ExtractAndIndexConstants.METRICS_CONSUMER_PARALLEL);
        */

        StormTopology stormTopology = extractAndIndexTopology.buildTopology();

        // Remote submission needs at least the topology name, worker count and max task parallelism.
        if (args != null && args.length > 2) {
            config.setNumWorkers(Integer.parseInt(args[1]));
            config.setMaxTaskParallelism(Integer.parseInt(args[2]));
            /*
            config.put(Config.NIMBUS_THRIFT_PORT, Integer.parseInt(args[6]));
            config.put(Config.STORM_ZOOKEEPER_PORT, Integer.parseInt(args[4]));
            config.put(Config.NIMBUS_HOST, args[5]);
            config.put(Config.STORM_ZOOKEEPER_SERVERS, Arrays.asList(args[3].split(";")));
            */
            if (args.length >= 4) {
                config.put(Config.TOPOLOGY_WORKER_CHILDOPTS, args[3]);
            }
            StormSubmitter.submitTopology(args[0], config, stormTopology);
        } else {
            config.setNumWorkers(1);
            config.setMaxTaskParallelism(1);

            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology("ExtractAndIndexTopology", config, stormTopology);
            Utils.sleep(6000000);
            cluster.killTopology("ExtractAndIndexTopology");
            cluster.shutdown();
        }
    }
}
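/*
 * Example submission to a running Storm cluster (illustrative only; the jar name,
 * topology name and argument values below are assumptions, not taken from the
 * project build):
 *
 *   storm jar dps-extract-and-index-topology.jar \
 *       eu.europeana.cloud.service.dps.storm.topologies.extractandindex.ExtractAndIndexTopology \
 *       extract_and_index_topology 2 2 "-Dhttp.proxyHost=proxy.example.org -Dhttp.proxyPort=8080"
 *
 * Running the class without arguments starts the topology in a LocalCluster instead.
 */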