// PartitionedParallelRegexGrep.java example
// streamsx.topology-master
/*
# Licensed Materials - Property of IBM
# Copyright IBM Corp. 2015  
 */
package parallel;

import static com.ibm.streamsx.topology.file.FileStreams.directoryWatcher;
import static com.ibm.streamsx.topology.file.FileStreams.textFileReader;
import static com.ibm.streamsx.topology.logic.Value.of;

import java.io.ObjectStreamException;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.ibm.streamsx.topology.TStream;
import com.ibm.streamsx.topology.Topology;
import com.ibm.streamsx.topology.context.StreamsContextFactory;
import com.ibm.streamsx.topology.function.Predicate;

/**
 * PartitionedParallelRegexGrep is like ParallelRegexGrep, except that the
 * tuples passed into the parallel region are routed to its channels using
 * {@code TStream.Routing.HASH_PARTITIONED}.
 * 
 * With hash partitioning, the channel a tuple is sent to is chosen from the
 * tuple's hashCode() value. In other words, all tuples that return the same
 * value from
 * 
 * tupleValue.hashCode()
 * 
 * are sent to the same channel. In this sample the tuples are the
 * java.lang.String lines read from the watched files, so equal lines are
 * always processed by the same channel. (One channel may still receive
 * several different lines.)
 * 
 * For this sample, if you read from a file that contains the following
 * lines:
 * 
 * <pre>
 * Apple
 * Orange
 * Banana
 * Banana
 * Apple
 * Apple
 * </pre>
 * 
 * you will notice that every Apple line is sent to the same channel of the
 * parallel region; the same holds for the Orange lines and for the Banana
 * lines.
 */
public class PartitionedParallelRegexGrep {
    static final Logger trace = Logger
            .getLogger("samples.partitionedparallelregexgrep");

    /** Number of channels requested for the parallel region. */
    private static final int CHANNEL_COUNT = 5;

    /**
     * Builds a topology that watches a directory, reads the lines of the
     * files found there, filters those lines against a regular expression
     * inside a hash-partitioned parallel region, prints the lines that
     * match, and submits the topology to the requested context.
     *
     * @param args
     *            {@code args[0]} is the context type handed to
     *            {@code StreamsContextFactory.getStreamsContext}, {@code args[1]}
     *            is the directory to watch, and {@code args[2]} is the regular
     *            expression a line must match in its entirety to be passed
     *            through. Any further arguments are ignored.
     * @throws IllegalArgumentException
     *             if fewer than three arguments are supplied
     * @throws Exception
     *             if the regular expression is invalid or the topology cannot
     *             be submitted
     */
    @SuppressWarnings("serial")
    public static void main(String[] args) throws Exception {
        // Fail with a readable usage message instead of an
        // ArrayIndexOutOfBoundsException when arguments are missing.
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: PartitionedParallelRegexGrep"
                            + " <contextType> <directory> <pattern>");
        }

        String contextType = args[0];
        String directory = args[1];
        final Pattern pattern = Pattern.compile(args[2]);

        // Define the topology
        Topology topology = new Topology("PartitionedParallelRegexGrep");

        // All streams with tuples that are Java String objects
        TStream<String> files = directoryWatcher(topology, directory);
        TStream<String> lines = textFileReader(files);

        // Begin parallel region
        TStream<String> parallelLines = lines.parallel(of(CHANNEL_COUNT),
                TStream.Routing.HASH_PARTITIONED);
        TStream<String> parallelFiltered = parallelLines
                .filter(new Predicate<String>() {

                    // java.util.regex.Matcher is not Serializable, so it is
                    // kept out of the serialized form of this predicate and
                    // created on first use. This works whether or not the
                    // instance being called went through deserialization.
                    private transient Matcher matcher;

                    @Override
                    public boolean test(String line) {
                        // If you inspect the output of the streams in this
                        // parallel region, you will see that any string that
                        // is sent to one channel will not be sent to another.
                        // In other words, if you see "apple" being sent to
                        // this channel, you will never see "apple" being sent
                        // to any other channel.
                        trace.info("Testing  string \"" + line
                                + "\" for the pattern.");

                        if (matcher == null) {
                            matcher = pattern.matcher("");
                        }

                        // Pass the line through if the whole line matches
                        // the regular expression pattern
                        return matcher.reset(line).matches();
                    }
                });

        // Combine the results of each parallel filter into one stream, ending
        // the parallel region.
        TStream<String> filtered = parallelFiltered.endParallel();

        // Print the combined results
        filtered.print();

        // Execute the topology
        StreamsContextFactory.getStreamsContext(contextType).submit(topology);
    }
}