/* # Licensed Materials - Property of IBM # Copyright IBM Corp. 2015 */ package twitter; import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher; import static com.ibm.streamsx.topology.file.FileStreams.textFileReader; import java.io.ObjectStreamException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.ibm.streamsx.topology.TStream; import com.ibm.streamsx.topology.Topology; import com.ibm.streamsx.topology.context.StreamsContextFactory; import com.ibm.streamsx.topology.function.Function; /** * Sample twitter trending topology application. This Java application builds a * topology that reads from a file of tweets, extracts the hashtags from each * line, and uses a window to keep track of the most popular hashtags from the * past 40,000 tweets. * * <br><br> * * Although the application reads from a file, in principle it could be attached * to a live data source. * * <BR> * <P> * If no arguments are provided then the topology is executed in embedded mode, * within this JVM. * <BR> * This may be executed from the {@code samples/java/functional} directory as: * <UL> * <LI>{@code ant run.twitter.trending} - Using Apache Ant, this will run in embedded * mode and assumes tweets are in CSV files in {@code $HOME/tweets}.</LI> * <LI> * {@code java -cp functionalsamples.jar:../../../com.ibm.streamsx.topology/lib/com.ibm.streamsx.topology.jar:$STREAMS_INSTALL/lib/com.ibm.streams.operator.samples.jar * twitter.TwitterTrending CONTEXT_TYPE DIRECTORY * } - Run directly from the command line. * </LI> * <i>CONTEXT_TYPE</i> is one of: * <UL> * <LI>{@code DISTRIBUTED} - Run as an IBM Streams distributed * application.</LI> * <LI>{@code STANDALONE} - Run as an IBM Streams standalone * application.</LI> * <LI>{@code EMBEDDED} - Run embedded within this JVM.</LI> * <LI>{@code BUNDLE} - Create an IBM Streams application bundle.</LI> * <LI>{@code TOOLKIT} - Create an IBM Streams application toolkit.</LI> * </UL> * and <i>DIRECTORY</i> is the location of a directory that contains one or more * text files containing lines of tweets. * </LI> * <LI> * An application execution within your IDE once you set the class path to include the correct jars.</LI> * </UL> * </P> */ public class TwitterTrending { private static final Pattern TAG_PATTERN = Pattern .compile("(?:^|\\s|[\\p{Punct}&&[^/]])(#[\\p{L}0-9-_]+)"); @SuppressWarnings("serial") public static void main(String args[]) throws Exception { if(args.length == 0){ throw new IllegalArgumentException("Must supply CONTEXT_TYPE and DIRECTORY as arguments"); } String contextType = args[0]; String directory = args[1]; // Define the topology Topology topology = new Topology("twitterPipeline"); // Stream containing file with tweets TStream<String> files = directoryWatcher(topology, directory); // Create a stream of lines from each file. TStream<String> lines = textFileReader(files); // Extract the hashtags from the string TStream<String> hashtags = lines.multiTransform( new Function<String, Iterable<String>>() { @Override public Iterable<String> apply(String v1) { ArrayList<String> tweetHashTags = new ArrayList<String>(); matcher.reset(v1); while (matcher.find()) { tweetHashTags.add(matcher.group(1)); } return tweetHashTags; } transient Matcher matcher; private Object readResolve() throws ObjectStreamException { matcher = TAG_PATTERN.matcher(""); return this; } }); // Extract the most frequent hashtags TStream<List<HashTagCount>> hashTagMap = hashtags.last(40000).aggregate( new Function<List<String>, List<HashTagCount>>() { @Override public List<HashTagCount> apply(List<String> v1) { Trender tre = new Trender(); for (String s_iter : v1) { tre.add(s_iter); } return tre.getTopTen(); } }); hashTagMap.print(); StreamsContextFactory.getStreamsContext(contextType).submit(topology); } }