package samples.expert;
import org.junit.Assert;
import water.Job;
import water.MRTask2;
import water.deploy.VM;
import water.fvec.Chunk;
import water.fvec.Frame;
import java.io.File;
/**
 * Sample for H2O's map-reduce API: a distributed task that adds up all the elements of a
 * single column.
 */
public class MapReduce extends Job {
  /**
   * Entry point: hands this job class to one of the sample launchers.
   */
  public static void main(String[] args) throws Exception {
    Class jobClass = MapReduce.class;
    // Active launcher.
    samples.launchers.CloudLocal.launch(jobClass, 1);
    // Alternative launchers, kept for reference; enable one in place of the call above.
    //samples.launchers.CloudProcess.launch(jobClass, 2);
    //samples.launchers.CloudConnect.launch(jobClass, "localhost:54321");
    //samples.launchers.CloudRemote.launchIPs(jobClass, "192.168.1.161", "192.168.1.162");
    //samples.launchers.CloudRemote.launchEC2(jobClass, 4);
  }

  /**
   * Parses the iris data set, runs the {@link Sum} task over its second column and prints
   * the resulting total.
   */
  @Override
  protected void execImpl() {
    // A Frame is H2O's distributed, table-like data structure; build one from the CSV file.
    File irisCsv = new File(VM.h2oFolder(), "smalldata/iris/iris.csv");
    Frame iris = samples.expert.Frames.parse(irisCsv);

    // Our custom map-reduce task. Fields assigned before the task is invoked are copied to
    // the instances created for local threads, and serialized to the instances used on
    // remote nodes.
    Sum task = new Sum();
    task.myInput = "blah";

    // Start a distributed fork-join: task instances are created and run in parallel, one
    // per chunk of the data. Here the task only runs on one column, the second one of the
    // frame.
    task.doAll(iris.vecs()[1]);

    // Every task instance has now been merged through 'reduce'; a single instance is left
    // and it holds the overall sum.
    System.out.println("Sum is " + task.value);
  }

  /**
   * Map-reduce task that accumulates the sum of the values of the chunks it visits.
   */
  static class Sum extends MRTask2<Sum> {
    /**
     * Input field. It is assigned only before the task runs, so it is copied to every
     * instance of the task and stays constant for the duration of a run.
     */
    String myInput;

    /**
     * Output field. The task updates it, so it has to be reduced between instances.
     */
    double value;

    /**
     * Called for each chunk of the distributed data structure; adds every row of the chunk
     * to {@link #value}.
     */
    @Override
    public void map(Chunk chunk) {
      // The input set by the caller must have reached this instance.
      Assert.assertEquals("blah", myInput);
      int row = 0;
      while( row < chunk._len ) {
        value += chunk.at0(row);
        row++;
      }
      // Optional: an input that is no longer needed can be set to null, which avoids
      // serializing it back to the initiating node.
      myInput = null;
    }

    /**
     * Called to merge another task instance into this one by adding its partial sum.
     */
    @Override
    public void reduce(Sum other) {
      value = value + other.value;
    }
  }
}