package eu.europeana.cloud.service.dps.storm.topologies.xslt;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import eu.europeana.cloud.service.dps.DpsTask;
import eu.europeana.cloud.service.dps.PluginParameterKeys;
import eu.europeana.cloud.service.dps.storm.io.*;
import eu.europeana.cloud.service.dps.storm.utils.TopologyHelper;
import eu.europeana.cloud.service.dps.storm.AbstractDpsBolt;
import eu.europeana.cloud.service.dps.storm.NotificationBolt;
import eu.europeana.cloud.service.dps.storm.NotificationTuple;
import eu.europeana.cloud.service.dps.storm.ParseTaskBolt;
import eu.europeana.cloud.service.dps.storm.topologies.properties.PropertyFileLoader;
import eu.europeana.cloud.service.dps.storm.topologies.properties.TopologyPropertyKeys;
import eu.europeana.cloud.service.dps.storm.xslt.XsltBolt;
import org.apache.storm.Config;
import org.apache.storm.StormSubmitter;
import org.apache.storm.generated.StormTopology;
import org.apache.storm.kafka.*;
import org.apache.storm.spout.SchemeAsMultiScheme;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
/**
 * This is the XSLT transformation topology for Apache Storm. The topology reads
 * records from the cloud, downloads an XSLT sheet from a remote server, applies
 * it to each record read, and saves the result back to the cloud.
 *
 * @author Franco Maria Nardini (francomaria.nardini@isti.cnr.it)
 */
public class XSLTTopology {

    private static final String TOPOLOGY_PROPERTIES_FILE = "xslt-topology-config.properties";

    // Stream names used by ParseTaskBolt to route tuples into the dataset / file branches.
    private static final String DATASET_STREAM = DpsTask.DATASET_URLS;
    private static final String FILE_STREAM = DpsTask.FILE_URLS;

    // Populated by the constructor; read afterwards by main() and parseIntProperty().
    private static Properties topologyProperties;

    private final BrokerHosts brokerHosts;

    /**
     * Loads the topology configuration and resolves the Kafka broker hosts from
     * the configured input Zookeeper address.
     *
     * @param defaultPropertyFile  classpath name of the default property file
     * @param providedPropertyFile optional path of an overriding property file ("" for none)
     */
    public XSLTTopology(String defaultPropertyFile, String providedPropertyFile) {
        topologyProperties = new Properties();
        PropertyFileLoader.loadPropertyFile(defaultPropertyFile, providedPropertyFile, topologyProperties);
        brokerHosts = new ZkHosts(topologyProperties.getProperty(TopologyPropertyKeys.INPUT_ZOOKEEPER_ADDRESS));
    }

    /**
     * Wires up the XSLT topology: Kafka spout -> task parsing -> (dataset | file)
     * read branches -> file retrieval -> XSLT transformation -> record write ->
     * dataset update, with every processing bolt also reporting to the
     * notification bolt on the notification stream.
     *
     * @param xsltTopic        Kafka topic the spout consumes DPS tasks from
     * @param ecloudMcsAddress base URL of the eCloud MCS service
     * @return the assembled Storm topology, ready for submission
     */
    public StormTopology buildTopology(String xsltTopic, String ecloudMcsAddress) {
        // Route each task-parameter key to the stream its branch consumes:
        // FILE_URLS tasks feed the file branch (RETRIEVE_FILE_BOLT, see FILE_STREAM
        // grouping below) and DATASET_URLS tasks feed the dataset-expansion branch.
        // The previous mapping was swapped, sending file tasks into the dataset branch.
        Map<String, String> routingRules = new HashMap<>();
        routingRules.put(PluginParameterKeys.FILE_URLS, FILE_STREAM);
        routingRules.put(PluginParameterKeys.DATASET_URLS, DATASET_STREAM);

        ReadFileBolt retrieveFileBolt = new ReadFileBolt(ecloudMcsAddress);
        WriteRecordBolt writeRecordBolt = new WriteRecordBolt(ecloudMcsAddress);

        SpoutConfig kafkaConfig = new SpoutConfig(brokerHosts, xsltTopic, "", "storm");
        kafkaConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
        // Consume only messages published after the topology starts.
        kafkaConfig.startOffsetTime = kafka.api.OffsetRequest.LatestTime();
        KafkaSpout kafkaSpout = new KafkaSpout(kafkaConfig);

        TopologyBuilder builder = new TopologyBuilder();

        builder.setSpout(TopologyHelper.SPOUT, kafkaSpout,
                parseIntProperty(TopologyPropertyKeys.KAFKA_SPOUT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.KAFKA_SPOUT_NUMBER_OF_TASKS));

        builder.setBolt(TopologyHelper.PARSE_TASK_BOLT, new ParseTaskBolt(routingRules),
                parseIntProperty(TopologyPropertyKeys.PARSE_TASKS_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.PARSE_TASKS_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.SPOUT);

        // Dataset branch: expand dataset URLs into representations, then into files.
        builder.setBolt(TopologyHelper.READ_DATASETS_BOLT, new ReadDatasetsBolt(),
                parseIntProperty(TopologyPropertyKeys.READ_DATASETS_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.READ_DATASETS_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.PARSE_TASK_BOLT, DATASET_STREAM);

        builder.setBolt(TopologyHelper.READ_DATASET_BOLT, new ReadDatasetBolt(ecloudMcsAddress),
                parseIntProperty(TopologyPropertyKeys.READ_DATASET_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.READ_DATASET_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.READ_DATASETS_BOLT);

        builder.setBolt(TopologyHelper.READ_REPRESENTATION_BOLT, new ReadRepresentationBolt(ecloudMcsAddress),
                parseIntProperty(TopologyPropertyKeys.READ_REPRESENTATION_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.READ_REPRESENTATION_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.READ_DATASET_BOLT);

        // Both branches converge here: file URLs arrive either directly from the
        // parsed task (file branch) or from the expanded dataset representations.
        builder.setBolt(TopologyHelper.RETRIEVE_FILE_BOLT, retrieveFileBolt,
                parseIntProperty(TopologyPropertyKeys.RETRIEVE_FILE_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.RETRIEVE_FILE_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.PARSE_TASK_BOLT, FILE_STREAM)
                .shuffleGrouping(TopologyHelper.READ_REPRESENTATION_BOLT);

        builder.setBolt(TopologyHelper.XSLT_BOLT, new XsltBolt(),
                parseIntProperty(TopologyPropertyKeys.XSLT_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.XSLT_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.RETRIEVE_FILE_BOLT);

        builder.setBolt(TopologyHelper.WRITE_RECORD_BOLT, writeRecordBolt,
                parseIntProperty(TopologyPropertyKeys.WRITE_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.WRITE_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.XSLT_BOLT);

        builder.setBolt(TopologyHelper.WRITE_TO_DATA_SET_BOLT, new AddResultToDataSetBolt(ecloudMcsAddress),
                parseIntProperty(TopologyPropertyKeys.ADD_TO_DATASET_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.ADD_TO_DATASET_BOLT_NUMBER_OF_TASKS))
                .shuffleGrouping(TopologyHelper.WRITE_RECORD_BOLT);

        // Notifications are fields-grouped by task id so all progress updates for
        // one task land on the same notification-bolt executor.
        builder.setBolt(TopologyHelper.NOTIFICATION_BOLT,
                new NotificationBolt(topologyProperties.getProperty(TopologyPropertyKeys.CASSANDRA_HOSTS),
                        parseIntProperty(TopologyPropertyKeys.CASSANDRA_PORT),
                        topologyProperties.getProperty(TopologyPropertyKeys.CASSANDRA_KEYSPACE_NAME),
                        topologyProperties.getProperty(TopologyPropertyKeys.CASSANDRA_USERNAME),
                        topologyProperties.getProperty(TopologyPropertyKeys.CASSANDRA_PASSWORD)),
                parseIntProperty(TopologyPropertyKeys.NOTIFICATION_BOLT_PARALLEL))
                .setNumTasks(parseIntProperty(TopologyPropertyKeys.NOTIFICATION_BOLT_NUMBER_OF_TASKS))
                .fieldsGrouping(TopologyHelper.PARSE_TASK_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.RETRIEVE_FILE_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.READ_DATASETS_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.READ_DATASET_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.READ_REPRESENTATION_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.XSLT_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.WRITE_RECORD_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName))
                .fieldsGrouping(TopologyHelper.WRITE_TO_DATA_SET_BOLT, AbstractDpsBolt.NOTIFICATION_STREAM_NAME,
                        new Fields(NotificationTuple.taskIdFieldName));

        return builder.createTopology();
    }

    /**
     * Reads the given key from the loaded topology properties and parses it as an int.
     *
     * @throws NumberFormatException if the property is missing or not numeric
     */
    private static int parseIntProperty(String propertyKey) {
        return Integer.parseInt(topologyProperties.getProperty(propertyKey));
    }

    /**
     * Builds the topology from configuration and submits it to the Storm cluster.
     *
     * @param args optional single argument: path of an overriding property file
     */
    public static void main(String[] args) throws Exception {
        if (args.length > 1) {
            // Previously extra arguments were silently ignored and nothing was submitted.
            System.err.println("Usage: XSLTTopology [providedPropertyFile]");
            return;
        }
        String providedPropertyFile = args.length == 1 ? args[0] : "";

        XSLTTopology xsltTopology = new XSLTTopology(TOPOLOGY_PROPERTIES_FILE, providedPropertyFile);
        String topologyName = topologyProperties.getProperty(TopologyPropertyKeys.TOPOLOGY_NAME);
        // By convention the Kafka topic name equals the topology name.
        String kafkaTopic = topologyName;
        String ecloudMcsAddress = topologyProperties.getProperty(TopologyPropertyKeys.MCS_URL);
        StormTopology stormTopology = xsltTopology.buildTopology(kafkaTopic, ecloudMcsAddress);

        Config config = new Config();
        config.put(Config.TOPOLOGY_TRIDENT_BATCH_EMIT_INTERVAL_MILLIS, 2000);
        config.setNumWorkers(parseIntProperty(TopologyPropertyKeys.WORKER_COUNT));
        config.setMaxTaskParallelism(parseIntProperty(TopologyPropertyKeys.MAX_TASK_PARALLELISM));
        config.put(Config.NIMBUS_THRIFT_PORT, parseIntProperty(TopologyPropertyKeys.THRIFT_PORT));
        // NOTE(review): this uses the Zookeeper *address* property value as the config
        // key, which looks suspicious — kept as-is; confirm the intended key before changing.
        config.put(topologyProperties.getProperty(TopologyPropertyKeys.INPUT_ZOOKEEPER_ADDRESS),
                topologyProperties.getProperty(TopologyPropertyKeys.INPUT_ZOOKEEPER_PORT));
        config.put(Config.NIMBUS_SEEDS,
                Arrays.asList(topologyProperties.getProperty(TopologyPropertyKeys.NIMBUS_SEEDS)));
        config.put(Config.STORM_ZOOKEEPER_SERVERS,
                Arrays.asList(topologyProperties.getProperty(TopologyPropertyKeys.STORM_ZOOKEEPER_ADDRESS)));

        StormSubmitter.submitTopology(topologyName, config, stormTopology);
    }
}