/*
 * Copyright (C) 2012 Michael Koppen
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.fhb.twitalyse.bolt.status.text;

import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
import de.fhb.twitalyse.bolt.redis.BaseRedisBolt;

/**
 * This Bolt analyses the given Twitter Status Text.
 *
 * <p>
 * For every incoming tuple it reads a status id (field 0) and the status text
 * (field 1), lower-cases the text and splits it into words. For each non-empty
 * word it calls {@code incr} for a global key and a per-day key, and it emits
 * an {@code (id, word)} tuple for every word that has at least
 * {@value #MIN_WORD_LENGTH} characters and is not in the ignore list.
 *
 * @author Michael Koppen <koppen@fh-brandenburg.de>
 */
public class SplitStatusTextBolt extends BaseRedisBolt {

	private static final long serialVersionUID = -7734590864277387631L;

	/** Key passed to {@code incr} once for every non-empty word. */
	private static final String ALL_WORDS_KEY = "#words_full";

	/** Prefix of the per-day key; the formatted current date is appended. */
	private static final String DAILY_WORDS_KEY_PREFIX = "#words_full_";

	/** Date pattern used for the suffix of the per-day key. */
	private static final String DAY_PATTERN = "dd_MM_yyyy";

	/** Regular expression matching one or more whitespace characters. */
	private static final String WORD_SEPARATOR = "\\s+";

	/** Words shorter than this are counted but never emitted. */
	private static final int MIN_WORD_LENGTH = 3;

	/** Words that are counted but never emitted. */
	private Collection<String> ignoreWords;

	/**
	 * Creates the bolt.
	 *
	 * @param ignoreList words that must not be emitted; they are compared
	 *            against the lower-cased words of the status text
	 * @param host host name handed to {@link BaseRedisBolt}
	 * @param port port handed to {@link BaseRedisBolt}
	 */
	public SplitStatusTextBolt(Collection<String> ignoreList, String host,
			int port) {
		super(host, port);
		this.ignoreWords = ignoreList;
	}

	/**
	 * Delegates to the superclass and stores the collector used by
	 * {@link #execute(Tuple)}.
	 *
	 * @param stormConf configuration passed through to the superclass
	 * @param context topology context passed through to the superclass
	 * @param collector collector used for emitting and acking tuples
	 */
	@Override
	public void prepare(Map stormConf, TopologyContext context,
			OutputCollector collector) {
		super.prepare(stormConf, context, collector);
		this.collector = collector;
	}

	/**
	 * Splits the status text of the given tuple into words, counts them and
	 * emits the relevant ones. The input tuple is acked afterwards, also when
	 * the text is {@code null}.
	 *
	 * @param input tuple with the status id at index 0 and the status text at
	 *            index 1
	 */
	@Override
	public void execute(Tuple input) {
		long id = input.getLong(0);
		String text = input.getString(1);

		// A tuple without text has no words; it only needs to be acked.
		if (text != null) {
			// The per-day key is the same for every word of this tuple, so it
			// is built once instead of once per word.
			String dailyWordsKey = DAILY_WORDS_KEY_PREFIX
					+ new SimpleDateFormat(DAY_PATTERN).format(new Date());

			// Locale.ROOT keeps the lower-casing independent of the JVM's
			// default locale, so the emitted words and the ignore-list
			// comparison behave the same on every host.
			String normalized = text.toLowerCase(Locale.ROOT).trim();

			// Split on runs of any whitespace so that tabs and line breaks
			// inside the text separate words as well.
			List<String> words = Arrays.asList(normalized
					.split(WORD_SEPARATOR));

			for (String word : words) {
				// An empty text yields a single empty token.
				if (word.isEmpty()) {
					continue;
				}

				// Saves # of all words
				this.incr(ALL_WORDS_KEY);
				// Saves # of words of today
				this.incr(dailyWordsKey);

				if (word.length() >= MIN_WORD_LENGTH
						&& !ignoreWords.contains(word)) {
					collector.emit(new Values(id, word));
				}
			}
		}

		collector.ack(input);
	}

	/**
	 * Declares the two output fields {@code id} and {@code word}.
	 *
	 * @param declarer declarer that receives the output fields
	 */
	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("id", "word"));
	}
}