/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2014 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/

package hu.sztaki.stratosphere.workshop.storm.wordcount;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import eu.stratosphere.streaming.util.PerformanceCounter;

/**
 * Storm word-count topology: a file-reading spout feeds a counting bolt, whose output is consumed by a no-op sink.
 * The spout and the counting bolt each report every emitted tuple to a {@link PerformanceCounter}.
 */
public class WordCountTopology {

    /**
     * Spout that reads a text file and emits every space-separated token of every non-empty line as a one-field
     * tuple ("word").
     */
    public static class TextSpout extends BaseRichSpout {
        private static final long serialVersionUID = 1L;

        /** Characters removed from each line before it is split into words. */
        private static final Pattern PUNCTUATION = Pattern.compile("[\\-\\+\\.\\^:,]");

        private PerformanceCounter performanceCounter;
        SpoutOutputCollector _collector;

        private String path;
        private String counterPath;

        /**
         * @param path
         *        path of the text file to read
         * @param counterPath
         *        prefix of the name handed to the performance counter
         */
        public TextSpout(String path, String counterPath) {
            this.path = path;
            this.counterPath = counterPath;
        }

        /**
         * Stores the collector and creates the performance counter, whose last argument is
         * {@code counterPath + "Spout" + taskId}.
         */
        @SuppressWarnings("rawtypes") // the Storm interface declares a raw Map
        @Override
        public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
            _collector = collector;
            this.performanceCounter = new PerformanceCounter("pc", 1000, 1000, 30000, counterPath + "Spout"
                    + context.getThisTaskId());
        }

        /**
         * Reads the whole file once per invocation and emits one tuple per word. The file is re-opened on every
         * call, so repeated invocations replay its content.
         *
         * <p>
         * I/O failures are printed to standard error and end the current pass; they are not propagated.
         */
        @Override
        public void nextTuple() {
            BufferedReader reader = null;
            try {
                reader = new BufferedReader(new FileReader(path));

                String rawLine;
                // readLine() returns null at end of file, which also covers an empty file
                while ((rawLine = reader.readLine()) != null) {
                    // strip punctuation from every line, not just the first one
                    String cleaned = PUNCTUATION.matcher(rawLine).replaceAll("");
                    if (cleaned.isEmpty()) {
                        continue;
                    }
                    for (String word : cleaned.split(" ")) {
                        // a fresh Values per emit: the emitted list must not be mutated afterwards, because a
                        // consumer in the same worker may still hold a reference to it
                        _collector.emit(new Values(word));
                        performanceCounter.count();
                    }
                }
            } catch (IOException e) {
                // FileNotFoundException is an IOException, so this covers both failure modes
                e.printStackTrace();
            } finally {
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }

        /** No-op: acknowledgements are ignored. */
        @Override
        public void ack(Object id) {
        }

        /** No-op: failures are ignored. */
        @Override
        public void fail(Object id) {
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word"));
        }
    }

    /**
     * Bolt that keeps a running count per word in memory and, for every incoming word, emits the pair
     * ("word", "count") carrying the updated count.
     */
    public static class WordCount extends BaseRichBolt {
        private static final long serialVersionUID = 1L;

        private PerformanceCounter performanceCounter;
        OutputCollector _collector;

        private String counterPath;

        private Map<String, Integer> wordCounts = new HashMap<String, Integer>();

        /**
         * @param counterPath
         *        prefix of the name handed to the performance counter
         */
        public WordCount(String counterPath) {
            this.counterPath = counterPath;
        }

        /**
         * Stores the collector and creates the performance counter, whose last argument is
         * {@code counterPath + "Counter" + taskId}.
         */
        @SuppressWarnings("rawtypes") // the Storm interface declares a raw Map
        @Override
        public void prepare(Map map, TopologyContext context, OutputCollector collector) {
            _collector = collector;
            this.performanceCounter = new PerformanceCounter("pc", 1000, 1000, 30000, counterPath + "Counter"
                    + context.getThisTaskId());
        }

        /**
         * Increments the count of the word in field 0 of {@code tuple} (starting at 1 for an unseen word) and emits
         * the word together with its new count.
         */
        @Override
        public void execute(Tuple tuple) {
            String word = tuple.getString(0);

            Integer previous = wordCounts.get(word);
            Integer count = (previous == null) ? 1 : previous + 1;
            wordCounts.put(word, count);

            // a fresh Values per emit, so that a downstream consumer never observes a later mutation
            _collector.emit(new Values(word, count));
            performanceCounter.count();
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "count"));
        }
    }

    /** Terminal bolt that discards every tuple it receives and declares no output fields. */
    public static class Sink extends BaseRichBolt {
        private static final long serialVersionUID = 1L;

        @SuppressWarnings("rawtypes") // the Storm interface declares a raw Map
        @Override
        public void prepare(Map map, TopologyContext context, OutputCollector collector) {
        }

        @Override
        public void execute(Tuple tuple) {
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
        }
    }

    /**
     * Builds and submits the topology.
     *
     * <p>
     * Expects exactly seven arguments, in this order: run mode ({@code cluster} submits to a cluster, any other
     * value runs locally), source file, performance counter path, spout parallelism, counter parallelism, sink
     * parallelism, number of workers. If the argument count is wrong, a number cannot be parsed, or the source
     * file does not exist, the usage message is printed instead.
     *
     * @param args
     *        command line arguments as described above
     * @throws Exception
     *         if submitting the topology fails or the local run is interrupted
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length != 7) {
            printUsage();
            return;
        }

        try {
            boolean runOnCluster = args[0].equals("cluster");
            String fileName = args[1];
            String counterPath = args[2];

            if (!(new File(fileName)).exists()) {
                throw new FileNotFoundException();
            }

            int spoutParallelism = Integer.parseInt(args[3]);
            int counterParallelism = Integer.parseInt(args[4]);
            int sinkParallelism = Integer.parseInt(args[5]);
            int numberOfWorkers = Integer.parseInt(args[6]);

            TopologyBuilder builder = new TopologyBuilder();
            builder.setSpout("spout", new TextSpout(fileName, counterPath), spoutParallelism);
            builder.setBolt("count", new WordCount(counterPath), counterParallelism).fieldsGrouping("spout",
                    new Fields("word"));
            builder.setBolt("sink", new Sink(), sinkParallelism).shuffleGrouping("count");

            Config conf = new Config();
            conf.setDebug(false);
            conf.setNumWorkers(numberOfWorkers);

            if (runOnCluster) {
                StormSubmitter.submitTopology("wordcount", conf, builder.createTopology());
            } else {
                // running locally for 50 seconds
                conf.setMaxTaskParallelism(3);
                LocalCluster cluster = new LocalCluster();
                try {
                    cluster.submitTopology("word-count", conf, builder.createTopology());
                    Thread.sleep(50000);
                } finally {
                    // shut the local cluster down even if submission fails or the sleep is interrupted
                    cluster.shutdown();
                }
            }
        } catch (NumberFormatException e) {
            printUsage();
        } catch (FileNotFoundException e) {
            printUsage();
        }
    }

    /** Prints the expected command line, with the arguments in the order {@link #main(String[])} reads them. */
    private static void printUsage() {
        System.out
                .println("USAGE:\n run <local/cluster> <source file> <performance counter path> <spout parallelism> <counter parallelism> <sink parallelism> <number of workers>");
    }
}