MapReduce.java example

Explorer
tinkerpop-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.tinkerpop.gremlin.process.computer;

import org.apache.commons.configuration.Configuration;
import org.apache.tinkerpop.gremlin.structure.Graph;
import org.apache.tinkerpop.gremlin.structure.Vertex;

import java.io.Serializable;
import java.lang.reflect.Constructor;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Optional;

/**
 * A MapReduce is composed of map(), combine(), and reduce() stages.
 * The map() stage processes the vertices of the {@link org.apache.tinkerpop.gremlin.structure.Graph} in a logically parallel manner.
 * The combine() stage aggregates the values of a particular map emitted key prior to sending across the cluster.
 * The reduce() stage aggregates the values of the combine/map emitted keys for the keys that hash to the current machine in the cluster.
 * The interface presented here is nearly identical to the interface popularized by Hadoop, except that map() is executed over the vertices of the graph.
 *
 * @author Marko A. Rodriguez (http://markorodriguez.com)
 */
public interface MapReduce<MK, MV, RK, RV, R> extends Cloneable {

    /**
     * The configuration key under which the class name of the MapReduce implementation is stored by
     * {@link MapReduce#storeState} and read by {@link MapReduce#createMapReduce}.
     */
    public static final String MAP_REDUCE = "gremlin.mapReduce";

    /**
     * MapReduce is composed of three stages: map, combine, and reduce.
     */
    public static enum Stage {
        MAP, COMBINE, REDUCE
    }

    /**
     * When it is necessary to store the state of a MapReduce job, this method is called.
     * This is typically required when the MapReduce job needs to be serialized to another machine.
     * Note that what is stored is simply the instance state, not any processed data.
     * The default implementation stores the class name of this instance under the {@link MapReduce#MAP_REDUCE} key.
     *
     * @param configuration the configuration to store the state of the MapReduce job in.
     */
    public default void storeState(final Configuration configuration) {
        configuration.setProperty(MAP_REDUCE, this.getClass().getName());
    }

    /**
     * When it is necessary to load the state of a MapReduce job, this method is called.
     * This is typically required when the MapReduce job needs to be serialized to another machine.
     * Note that what is loaded is simply the instance state, not any processed data.
     * <p>
     * It is important that the state loaded from loadState() is identical to any state created from a constructor.
     * For those GraphComputers that do not need to use Configurations to migrate state between JVMs, only the constructor will be used.
     * The default implementation is a no-op.
     *
     * @param graph         the graph the MapReduce job will run against
     * @param configuration the configuration to load the state of the MapReduce job from.
     */
    public default void loadState(final Graph graph, final Configuration configuration) {

    }

    /**
     * A MapReduce job can be map-only, map-reduce-only, or map-combine-reduce.
     * Before executing the particular stage, this method is called to determine if the respective stage is defined.
     * This method should return true if the respective stage has a non-default method implementation.
     *
     * @param stage the stage to check for definition.
     * @return whether that stage should be executed.
     */
    public boolean doStage(final Stage stage);

    /**
     * The map() method is logically executed at all vertices in the graph in parallel.
     * The map() method emits key/value pairs given some analysis of the data in the vertices (and/or its incident edges).
     * All {@link MapReduce} classes must at least provide an implementation of {@code MapReduce#map(Vertex, MapEmitter)}.
     *
     * @param vertex  the current vertex being map() processed.
     * @param emitter the component that allows for key/value pairs to be emitted to the next stage.
     */
    public void map(final Vertex vertex, final MapEmitter<MK, MV> emitter);

    /**
     * The combine() method is logically executed at all "machines" in parallel.
     * The combine() method pre-combines the values for a key prior to propagation over the wire.
     * The combine() method must emit the same key/value pairs as the reduce() method.
     * If there is a combine() implementation, there must be a reduce() implementation.
     * If the MapReduce implementation is single machine, it can skip executing this method as reduce() is sufficient.
     * The default implementation emits nothing.
     *
     * @param key     the key that has aggregated values
     * @param values  the aggregated values associated with the key
     * @param emitter the component that allows for key/value pairs to be emitted to the reduce stage.
     */
    public default void combine(final MK key, final Iterator<MV> values, final ReduceEmitter<RK, RV> emitter) {
    }

    /**
     * The reduce() method is logically executed on the "machine" the respective key hashes to.
     * The reduce() method combines all the values associated with the key and emits key/value pairs.
     * The default implementation emits nothing.
     *
     * @param key     the key that has aggregated values
     * @param values  the aggregated values associated with the key
     * @param emitter the component that allows for key/value pairs to be emitted as the final result.
     */
    public default void reduce(final MK key, final Iterator<MV> values, final ReduceEmitter<RK, RV> emitter) {
    }

    /**
     * This method is called at the start of the respective {@link MapReduce.Stage} for a particular "chunk of vertices."
     * The set of vertices in the graph are typically not processed with full parallelism.
     * The vertex set is split into subsets and a worker is assigned to call the MapReduce methods on each subset.
     * The default implementation is a no-op.
     *
     * @param stage the stage of the MapReduce computation
     */
    public default void workerStart(final Stage stage) {

    }

    /**
     * This method is called at the end of the respective {@link MapReduce.Stage} for a particular "chunk of vertices."
     * The set of vertices in the graph are typically not processed with full parallelism.
     * The vertex set is split into subsets and a worker is assigned to call the MapReduce methods on each subset.
     * The default implementation is a no-op.
     *
     * @param stage the stage of the MapReduce computation
     */
    public default void workerEnd(final Stage stage) {

    }

    /**
     * If a {@link Comparator} is provided, then all pairs leaving the {@link MapEmitter} are sorted.
     * The sorted results are either fed sorted to the combine/reduce-stage or as the final output.
     * If sorting is not required, then {@link Optional#empty} should be returned as sorting is computationally expensive.
     * The default implementation returns {@link Optional#empty}.
     *
     * @return an {@link Optional} of a comparator for sorting the map output.
     */
    public default Optional<Comparator<MK>> getMapKeySort() {
        return Optional.empty();
    }

    /**
     * If a {@link Comparator} is provided, then all pairs leaving the {@link ReduceEmitter} are sorted.
     * If sorting is not required, then {@link Optional#empty} should be returned as sorting is computationally expensive.
     * The default implementation returns {@link Optional#empty}.
     *
     * @return an {@link Optional} of a comparator for sorting the reduce output.
     */
    public default Optional<Comparator<RK>> getReduceKeySort() {
        return Optional.empty();
    }

    /**
     * The key/value pairs emitted by reduce() (or map() in a map-only job) can be iterated to generate a local JVM Java object.
     *
     * @param keyValues the key/value pairs that were emitted from reduce() (or map() in a map-only job)
     * @return the resultant object formed from the emitted key/values.
     */
    public R generateFinalResult(final Iterator<KeyValue<RK, RV>> keyValues);

    /**
     * The results of the MapReduce job are associated with a memory-key to ultimately be stored in {@link Memory}.
     *
     * @return the memory key of the generated result object.
     */
    public String getMemoryKey();

    /**
     * The final result can be generated and added to {@link Memory} and accessible via {@link org.apache.tinkerpop.gremlin.process.computer.util.DefaultComputerResult}.
     * The default simply takes the object from generateFinalResult() and adds it to the Memory given getMemoryKey().
     *
     * @param memory    the memory of the {@link GraphComputer}
     * @param keyValues the key/value pairs emitted from reduce() (or map() in a map only job).
     */
    public default void addResultToMemory(final Memory.Admin memory, final Iterator<KeyValue<RK, RV>> keyValues) {
        memory.set(this.getMemoryKey(), this.generateFinalResult(keyValues));
    }

    /**
     * When multiple workers on a single machine need MapReduce instances, it is possible to use clone.
     * This will provide a speedier way of generating instances, over the {@link MapReduce#storeState} and {@link MapReduce#loadState} model.
     * This interface provides no default implementation, so every implementation must supply its own public clone().
     * An implementation that is a stateless singleton may simply return itself.
     *
     * @return A clone of the MapReduce object
     */
    @SuppressWarnings("CloneDoesntDeclareCloneNotSupportedException")
    public MapReduce<MK, MV, RK, RV, R> clone();

    /**
     * A helper method to construct a {@link MapReduce} given the content of the supplied configuration.
     * The class of the MapReduce is read from the {@link MapReduce#MAP_REDUCE} static configuration key.
     * The class is instantiated through its no-argument constructor, which need not be public.
     * Once the MapReduce is constructed, {@link MapReduce#loadState} method is called with the provided configuration.
     *
     * @param graph         The graph that the MapReduce job will run against
     * @param configuration A configuration with requisite information to build a MapReduce
     * @return the newly constructed MapReduce
     * @throws IllegalStateException if the configuration does not name a class, the named class is not a
     *                               {@link MapReduce}, or the instance cannot be constructed or have its state loaded
     */
    public static <M extends MapReduce> M createMapReduce(final Graph graph, final Configuration configuration) {
        // Kept outside the try block so that the failure message can name the offending class.
        String className = null;
        try {
            className = configuration.getString(MAP_REDUCE);
            if (null == className || className.isEmpty()) {
                throw new IllegalStateException("The configuration does not specify a MapReduce class under the key: " + MAP_REDUCE);
            }
            final Class<?> candidateClass = Class.forName(className);
            // Verify the type before instantiating so that the constructor of an unrelated class is never executed.
            if (!MapReduce.class.isAssignableFrom(candidateClass)) {
                throw new IllegalStateException("The configured class is not a MapReduce implementation: " + className);
            }
            // The cast cannot be verified against the caller's choice of M; only assignability to MapReduce was checked above.
            @SuppressWarnings("unchecked")
            final Class<M> mapReduceClass = (Class<M>) candidateClass;
            final Constructor<M> constructor = mapReduceClass.getDeclaredConstructor();
            constructor.setAccessible(true);
            final M mapReduce = constructor.newInstance();
            mapReduce.loadState(graph, configuration);
            return mapReduce;
        } catch (final IllegalStateException e) {
            // Already the exception type callers expect; rethrow rather than wrapping it a second time.
            throw e;
        } catch (final Exception e) {
            // e.getMessage() may be null (e.g. InvocationTargetException), so include the exception itself for context.
            throw new IllegalStateException("Unable to create the MapReduce [" + className + "] from the configuration: " + e, e);
        }
    }

    //////////////////

    /**
     * The MapEmitter is used to emit key/value pairs from the map() stage of the MapReduce job.
     * The implementation of MapEmitter is up to the vendor, not the developer.
     *
     * @param <K> the key type
     * @param <V> the value type
     */
    public interface MapEmitter<K, V> {
        /**
         * Emit a key/value pair from the map() stage.
         *
         * @param key   the key to emit.
         * @param value the value to emit.
         */
        public void emit(final K key, final V value);

        /**
         * A default method that assumes the key is {@link org.apache.tinkerpop.gremlin.process.computer.MapReduce.NullObject}.
         * The cast to the key type is unchecked, so this method should only be used when the key type can hold a NullObject.
         *
         * @param value the value to emit.
         */
        @SuppressWarnings("unchecked")
        public default void emit(final V value) {
            this.emit((K) MapReduce.NullObject.instance(), value);
        }
    }

    /**
     * The ReduceEmitter is used to emit key/value pairs from the combine() and reduce() stages of the MapReduce job.
     * The implementation of ReduceEmitter is up to the vendor, not the developer.
     *
     * @param <OK> the key type
     * @param <OV> the value type
     */
    public interface ReduceEmitter<OK, OV> {
        /**
         * Emit a key/value pair from the combine() or reduce() stage.
         *
         * @param key   the key to emit.
         * @param value the value to emit.
         */
        public void emit(final OK key, final OV value);

        /**
         * A default method that assumes the key is {@link org.apache.tinkerpop.gremlin.process.computer.MapReduce.NullObject}.
         * The cast to the key type is unchecked, so this method should only be used when the key type can hold a NullObject.
         *
         * @param value the value to emit.
         */
        @SuppressWarnings("unchecked")
        public default void emit(final OV value) {
            this.emit((OK) MapReduce.NullObject.instance(), value);
        }
    }

    //////////////////

    /**
     * A convenience singleton when a single key is needed so that all emitted values converge to the same combiner/reducer.
     * All NullObject instances are equal to one another, share the same hash code, and compare as equal.
     */
    public static class NullObject implements Comparable<NullObject>, Serializable {
        private static final NullObject INSTANCE = new NullObject();
        private static final String NULL_OBJECT = "";

        /**
         * @return the shared NullObject instance.
         */
        public static NullObject instance() {
            return INSTANCE;
        }

        @Override
        public int hashCode() {
            return -9832049;
        }

        @Override
        public boolean equals(final Object object) {
            return this == object || object instanceof NullObject;
        }

        @Override
        public int compareTo(final NullObject object) {
            return 0;
        }

        @Override
        public String toString() {
            return NULL_OBJECT;
        }

        /**
         * Resolves an instance read through Java serialization back to the shared instance,
         * so that deserialization does not create additional NullObject instances.
         *
         * @return the shared NullObject instance.
         */
        private Object readResolve() {
            return INSTANCE;
        }
    }
}