package hex.createframe;
import water.Iced;
import water.fvec.NewChunk;
import java.util.Random;
/**
* Base class for all "column makers" used by the CreateFrameExecutor to construct the frame.
* Typically a subclass would be creating just a single column having a certain type, name
* and the distribution of values. However it is also possible to create a subclass that
* constructs 0 columns (i.e. just modifies the previously constructed ones), or one that
* creates more than one column at once.
*/
public abstract class CreateFrameColumnMaker extends Iced<CreateFrameColumnMaker> {

  /** Position of the first column owned by this maker; assigned through {@link #setIndex(int)}. */
  protected int index;

  /**
   * Records the index of the first column this column maker is going to create.
   * The executor calls this method; subclasses can then consult the {@link #index}
   * field to decide which entries of the <code>ncs</code> array they should fill
   * during the {@link #exec(int, NewChunk[], Random)} step.
   *
   * @param i Index of the first column assigned to this column maker.
   */
  public void setIndex(int i) {
    this.index = i;
  }

  /**
   * Builds the column(s). Every subclass must provide its own implementation.
   *
   * @param nrows Number of rows in the current chunk. A maker that creates new
   *              columns is expected to add exactly this many rows.
   * @param ncs   Array of `NewChunk`s handed down from the `map()` method of the
   *              `MRTask`. The subclass is expected to know which NewChunks it may
   *              touch, usually with the help of the {@link #index} variable.
   * @param rng   Random number generator the subclass may use for filling the
   *              columns randomly. Do NOT use any other source of randomness:
   *              doing so would break the reproducibility promise of the
   *              CreateFrame service.
   */
  public abstract void exec(int nrows, NewChunk[] ncs, Random rng);

  /**
   * How many columns this column maker describes. The default is 1. A maker may
   * also describe 0 columns (it only modifies existing ones), or several columns
   * at once (for example when producing a one-hot encoded categorical).
   *
   * @return Number of columns described by this maker; 1 unless overridden.
   */
  public int numColumns() {
    return 1;
  }

  /**
   * Types of the columns produced by this column maker. The returned array should
   * contain exactly as many elements as reported by {@link #numColumns()}.
   */
  public abstract byte[] columnTypes();

  /**
   * Names of the columns produced by this column maker. The returned array should
   * contain exactly as many elements as reported by {@link #numColumns()}.
   */
  public abstract String[] columnNames();

  /**
   * Domains for the categorical columns being created (if any).
   *
   * @return The default implementation returns {@code null}.
   */
  public String[][] columnDomains() {
    return null;
  }

  //--------------------------------------------------------------------------------------------------------------------

  /**
   * Rough estimate of how many bytes a single row created by this column maker
   * occupies. The estimate is later used to determine the optimal chunk size for
   * the produced frame, so it does not have to be very precise.
   *
   * @return Estimated bytes per row; 4 unless overridden.
   */
  public float byteSizePerRow() {
    return 4.0f;
  }

  /**
   * <p>Relative amount of work this column maker performs to fill a chunk. The
   * base amount of 100 corresponds to a method that draws a single random
   * number per row and then applies simple arithmetic before adding a value to
   * the NewChunk.
   * <p>The output is used to inform the {@link water.Job} about the progress
   * being made. It needn't be very precise.
   *
   * @return Relative work amount; 100 unless overridden.
   */
  public int workAmount() {
    return 100;
  }
}