/*
# Licensed Materials - Property of IBM
# Copyright IBM Corp. 2015
 */
package parallel;

import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher;
import static com.ibm.streamsx.topology.file.FileStreams.textFileReader;
import static com.ibm.streamsx.topology.logic.Value.of;

import java.io.ObjectStreamException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.streamsx.topology.TStream;
import com.ibm.streamsx.topology.Topology;
import com.ibm.streamsx.topology.context.StreamsContextFactory;
import com.ibm.streamsx.topology.function.Predicate;

/**
 * PartitionedParallelRegexGrep is like ParallelRegexGrep, except that the
 * parallel region is created with TStream.Routing.HASH_PARTITIONED routing,
 * which maps each tuple to its channel in the parallel region using the
 * hash code of the tuple value.
 *
 * All tuples whose values return the same hashCode() result are sent to the
 * same channel of the parallel region. In other words, for each tuple, the
 * value returned by
 *
 * tupleValue.hashCode()
 *
 * determines the channel, and every tuple which returns that result will go
 * to the same channel. To show this, the lines read from the files are
 * passed into the parallel region as java.lang.String tuples, so identical
 * lines always produce identical hash codes.
 *
 * For this sample, if you read from a file that contains the following
 * lines:
 *
 * Apple
 * Orange
 * Banana
 * Banana
 * Apple
 * Apple
 *
 * you will notice that the lines containing Apple are always sent to the
 * same channel of the parallel region; the same holds for the lines
 * containing Orange and Banana.
* * */ public class PartitionedParallelRegexGrep { static final Logger trace = Logger .getLogger("samples.partitionedparallelregexgrep"); @SuppressWarnings("serial") public static void main(String[] args) throws Exception { String contextType = args[0]; String directory = args[1]; final Pattern pattern = Pattern.compile(args[2]); // Define the topology Topology topology = new Topology("PartitionedParallelRegexGrep"); // All streams with tuples that are Java String objects TStream<String> files = directoryWatcher(topology, directory); TStream<String> lines = textFileReader(files); // Begin parallel region TStream<String> parallelLines = lines .parallel(of(5), TStream.Routing.HASH_PARTITIONED); TStream<String> ParallelFiltered = parallelLines .filter(new Predicate<String>() { @Override public boolean test(String v1) { // If you inspect the output of the streams in this // parallel // region, you will see that any string that is sent to // one // channel will not be sent to another. In other words, // if you // see "apple" being sent to this channel, you will // never see // "apple" being sent to any other channel. trace.info("Testing string \"" + v1 + "\" for the pattern."); // Pass the line through if it matches the // regular expression pattern return matcher.reset(v1).matches(); } transient Matcher matcher; private Object readResolve() throws ObjectStreamException { matcher = pattern.matcher(""); return this; } }); // Combine the results of each parallel filter into one stream, ending // the parallel region. TStream<String> filtered_condensed = ParallelFiltered .endParallel(); // Print the combined results filtered_condensed.print(); // Execute the topology StreamsContextFactory.getStreamsContext(contextType).submit(topology); } }