package hex.createframe;

import water.*;
import water.fvec.*;
import water.util.RandomUtils;

import java.util.ArrayList;
import java.util.Random;

/**
 * <p>This class carries out the frame creation job.</p>
 *
 * <p>Frame creation is conceptually done in 3 stages: First, a build "recipe"
 * is prepared. This recipe is the detailed specification of how the frame is
 * to be constructed. Second, an MRTask is run that actually creates the frame,
 * according to the specification in the recipe. In this step all "column
 * makers" are executed in the order they were added, for each chunk being
 * created. Finally, a set of postprocessing steps is performed on the
 * resulting frame.</p>
 *
 * <p>Usage example:
 * <pre>{@code
 *   Job<Frame> job = new Job<>(destination_key, Frame.class.getName(), "CreateFrame");
 *   CreateFrameExecutor cfe = new CreateFrameExecutor(job);
 *   cfe.setNumRows(10000);
 *   cfe.setSeed(0xDECAFC0FEEL);
 *   cfe.addColumnMaker(new RealColumnCfcm("col0", -1, 1));
 *   cfe.addColumnMaker(new IntegerColumnCfcm("col1", 0, 100));
 *   cfe.addPostprocessStep(new MissingInserterCfps(0.05));
 *   job.start(cfe, cfe.workAmount());
 * }</pre></p>
 */
public class CreateFrameExecutor extends H2O.H2OCountedCompleter<CreateFrameExecutor> {
  private Job<Frame> job;
  private ArrayList<CreateFrameColumnMaker> columnMakers;
  private ArrayList<CreateFramePostprocessStep> postprocessSteps;
  private int workAmountPerRow;
  private int workAmountPostprocess;
  private float bytesPerRow;
  private int numRows;
  private int numCols;
  private long seed;

  /**
   * Make a new CreateFrameExecutor.
   *
   * @param job The {@link Job} instance which is wrapping this executor. This
   *            instance will be used to update it with the current task
   *            progress.
   */
  public CreateFrameExecutor(Job<Frame> job) {
    this.job = job;
    columnMakers = new ArrayList<>(10);
    postprocessSteps = new ArrayList<>(2);
    seed = -1;
  }

  /**
   * Set number of rows to be created in the resulting frame. (However a
   * postprocess step may remove some of the rows).
   *
   * @param n requested number of rows
   */
  public void setNumRows(int n) {
    numRows = n;
  }

  /**
   * Set the seed for the random number generator. Two frames created from the
   * same seed will be identical. Seed value of -1 (the default) means that a
   * random seed will be issued.
   *
   * @param s seed value, or -1 to have one picked at execution time
   */
  public void setSeed(long s) {
    seed = s;
  }

  /**
   * Add a "column maker" task. A maker is responsible for creating one column
   * or, when its {@code numColumns()} is greater than one, a group of adjacent
   * columns. The maker is told the index of its first column in the output
   * frame, and its work / byte-size estimates are added to the running totals.
   *
   * @param maker the column maker to append to the recipe
   */
  public void addColumnMaker(CreateFrameColumnMaker maker) {
    maker.setIndex(numCols);
    columnMakers.add(maker);
    workAmountPerRow += maker.workAmount();
    bytesPerRow += maker.byteSizePerRow();
    numCols += maker.numColumns();
  }

  /**
   * Add a step to be performed in the end after the frame has been created.
   * This step can then modify the frame in any way.
   *
   * @param step the postprocessing step to append to the recipe
   */
  public void addPostprocessStep(CreateFramePostprocessStep step) {
    postprocessSteps.add(step);
    workAmountPostprocess += step.workAmount();
  }

  /**
   * Return total amount of work that will be performed by the executor. This
   * is needed externally in the Job execution context to determine the
   * progress of the task if it is long-running.
   *
   * <p>The total is computed in 64-bit arithmetic and saturated to the
   * {@code int} range, so that very large frames report
   * {@link Integer#MAX_VALUE} instead of a wrapped-around (possibly negative)
   * number.</p>
   *
   * @return {@code numRows * workAmountPerRow + workAmountPostprocess},
   *         clamped to the range of {@code int}
   */
  public int workAmount() {
    long total = (long) numRows * workAmountPerRow + workAmountPostprocess;
    if (total > Integer.MAX_VALUE) return Integer.MAX_VALUE;
    if (total < Integer.MIN_VALUE) return Integer.MIN_VALUE;
    return (int) total;
  }

  /**
   * Estimated size of the frame (in bytes), to be used in determining the
   * optimal chunk size. This estimate may not be absolutely precise.
   *
   * @return number of rows multiplied by the accumulated per-row byte size
   */
  public long estimatedByteSize() {
    return (long) (numRows * bytesPerRow);
  }


  //--------------------------------------------------------------------------------------------------------------------
  // Private
  //--------------------------------------------------------------------------------------------------------------------

  /**
   * Build the frame: lay out a template Vec, run all column makers over it,
   * apply the postprocessing steps, and publish the result into the DKV.
   */
  @Override
  public void compute2() {
    // NOTE(review): Math.log1p is the natural logarithm of (1 + x). If the third
    // argument of Vec.makeCon is a base-2 exponent (confirm against Vec.makeCon),
    // this yields smaller chunks than rowsPerChunk() recommends. It is left as is
    // on purpose: the chunk layout feeds the per-chunk RNG seeds in
    // ActualFrameCreator.map, so changing it would change the data produced for
    // a given seed.
    int logRowsPerChunk = (int) Math.ceil(Math.log1p(rowsPerChunk()));
    Vec dummyVec = Vec.makeCon(0, numRows, logRowsPerChunk, false);
    try {
      if (seed == -1) seed = Double.doubleToLongBits(Math.random());

      // Create types, names & domains
      byte[] types = new byte[numCols];
      String[] names = new String[numCols];
      String[][] domains = new String[numCols][];
      int i = 0;
      for (CreateFrameColumnMaker maker : columnMakers) {
        int it = 0, in = 0, id = 0;
        for (byte t : maker.columnTypes()) types[i + it++] = t;
        for (String n : maker.columnNames()) names[i + in++] = n;
        String[][] colDomains = maker.columnDomains();
        if (colDomains != null) {
          for (String[] d : colDomains) domains[i + id++] = d;
        } // otherwise don't do anything and leave those entries in `domains` as nulls.
        assert in == it && (id == it || id == 0) && it == maker.numColumns();
        i += it;
      }

      // Make the frame
      Frame out = new ActualFrameCreator(columnMakers, seed, job)
          .doAll(types, dummyVec)
          .outputFrame(job._result, names, domains);

      // Post-process the frame. The seed for the next step is drawn *before* the
      // current step runs and installed afterwards, so the random stream a step
      // sees does not depend on how many values the previous steps consumed.
      Random rng = RandomUtils.getRNG(seed + 40245345791L);
      rng.setSeed(rng.nextLong());
      for (CreateFramePostprocessStep step : postprocessSteps) {
        long nextSeed = rng.nextLong();
        step.exec(out, rng);
        rng.setSeed(nextSeed);
        job.update(step.workAmount());
      }

      DKV.put(out);
    } finally {
      // Clean up the template Vec even when frame creation or a postprocess
      // step fails, so that it is not left behind.
      dummyVec.remove();
    }
    tryComplete();
  }

  /** Compute optimal number of rows per chunk in the resulting frame. */
  private int rowsPerChunk() {
    return FileVec.calcOptimalChunkSize(
        estimatedByteSize(),
        numCols,
        numCols * 4,
        Runtime.getRuntime().availableProcessors(),
        H2O.getCloudSize(),
        false,
        false
    );
  }


  /**
   * Task that fills the chunks of the new frame by running every column maker
   * on every chunk of the template Vec.
   */
  private static class ActualFrameCreator extends MRTask<ActualFrameCreator> {
    private long seed;
    private ArrayList<CreateFrameColumnMaker> columnMakers;
    private Job<Frame> job;

    public ActualFrameCreator(ArrayList<CreateFrameColumnMaker> columnMakers, long seed, Job<Frame> job) {
      this.columnMakers = columnMakers;
      this.seed = seed;
      this.job = job;
    }

    /**
     * Fill one chunk's worth of rows. Does nothing if the job was asked to stop.
     *
     * @param cs  chunks of the template Vec; only the length and the starting
     *            row of the first one are used
     * @param ncs output chunks handed to each column maker
     */
    @Override
    public void map(Chunk[] cs, NewChunk[] ncs) {
      if (job.stop_requested()) return;
      int numRowsInChunk = cs[0]._len;
      long chunkPosition = cs[0].start();
      Random rng = RandomUtils.getRNG(0);
      long taskIndex = 0;
      for (CreateFrameColumnMaker colTask : columnMakers) {
        // Each (chunk, column maker) pair gets its own seed, derived from the
        // global seed, the chunk's starting row and the maker's position.
        rng.setSeed(seed + chunkPosition * 138457623L + (taskIndex++) * 967058L);
        rng.setSeed(rng.nextLong());
        colTask.exec(numRowsInChunk, ncs, rng);
        // The total declared by CreateFrameExecutor.workAmount() counts each
        // maker's work once per row, so report it scaled by the rows just made.
        job.update((long) colTask.workAmount() * numRowsInChunk);
      }
    }
  }
}