package water.util;
import hex.Model;
import hex.ToEigenVec;
import jsr166y.CountedCompleter;
import water.*;
import water.fvec.*;
import water.parser.ParseDataset;
import water.parser.ParseSetup;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
public class FrameUtils {
/** Parse the given file(s) into a single frame stored under the given key.
*
* @param okey destination key for parsed frame
* @param files files to parse
* @return a new frame
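   * @throws IOException if a file cannot be read or parsed
   *
   * A minimal usage sketch (the file path is illustrative):
   * <pre>{@code
   * Frame fr = FrameUtils.parseFrame(Key.make("iris.hex"), new File("smalldata/iris/iris.csv"));
   * }</pre>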
*/
public static Frame parseFrame(Key okey, File ...files) throws IOException {
if (files == null || files.length == 0) {
throw new IllegalArgumentException("List of files is empty!");
}
for (File f : files) {
if (!f.exists())
throw new FileNotFoundException("File not found " + f);
}
// Create output key if it is not given
if(okey == null) okey = Key.make(files[0].getName());
Key[] inKeys = new Key[files.length];
for (int i=0; i<files.length; i++) inKeys[i] = NFSFileVec.make(files[i])._key;
return ParseDataset.parse(okey, inKeys);
}
/** Parse the given set of URIs into a single frame stored under the given key.
 *
 * @param okey key for the output frame; may be null, in which case a key is derived from the first URI
 * @param uris array of URIs (file://, hdfs://, s3n://, s3a://, s3://, ...) to parse
 * @return a frame which is saved into DKV under okey
 * @throws IOException in case of a parse error.
*/
public static Frame parseFrame(Key okey, URI ...uris) throws IOException {
return parseFrame(okey, null, uris);
}
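/** Parse the given set of URIs into a single frame, using the supplied setup as a parsing hint.
 *
 * @param okey key for the output frame; may be null, in which case a key is derived from the first URI
 * @param parseSetup user-specified parse setup, used as a hint for {@link ParseSetup#guessSetup}; may be null
 * @param uris array of URIs (file://, hdfs://, s3://, ...) to parse
 * @return a frame which is saved into DKV under okey
 * @throws IOException in case of a parse error.
 */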
public static Frame parseFrame(Key okey, ParseSetup parseSetup, URI ...uris) throws IOException {
if (uris == null || uris.length == 0) {
throw new IllegalArgumentException("List of uris is empty!");
}
if(okey == null) okey = Key.make(uris[0].toString());
Key[] inKeys = new Key[uris.length];
for (int i=0; i<uris.length; i++) inKeys[i] = H2O.getPM().anyURIToKey(uris[i]);
// Return result
return parseSetup != null ? ParseDataset.parse(okey, inKeys, true, ParseSetup.guessSetup(inKeys, parseSetup))
: ParseDataset.parse(okey, inKeys);
}
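/** Guess the parser setup for the given URIs, using the user-specified setup as a hint. */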
public static ParseSetup guessParserSetup(ParseSetup userParserSetup, URI ...uris) throws IOException {
Key[] inKeys = new Key[uris.length];
for (int i=0; i<uris.length; i++) inKeys[i] = H2O.getPM().anyURIToKey(uris[i]);
return ParseSetup.guessSetup(inKeys, userParserSetup);
}
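/**
 * Apply the given categorical encoding scheme to a frame. Schemes that the model builders handle
 * internally (AUTO, Enum, SortByResponse, OneHotInternal) return the dataset unchanged; the remaining
 * schemes run an encoder job and return a new frame.
 *
 * @param dataset frame to encode
 * @param skipCols names of columns to leave untouched (e.g. response or weight columns)
 * @param scheme categorical encoding scheme to apply
 * @param tev eigenvector converter, required only for the Eigen scheme
 * @return the encoded frame (the input frame itself for internally handled schemes)
 */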
public static Frame categoricalEncoder(Frame dataset, String[] skipCols, Model.Parameters.CategoricalEncodingScheme scheme, ToEigenVec tev) {
switch (scheme) {
case AUTO:
case Enum:
case SortByResponse: // the work is done in ModelBuilder; the domain is all we need to change once, and adaptTestTrain takes care of test-set adaptation
case OneHotInternal:
return dataset; //leave as is - most algos do their own internal default handling of enums
case OneHotExplicit:
return new CategoricalOneHotEncoder(dataset, skipCols).exec().get();
case Binary:
return new CategoricalBinaryEncoder(dataset, skipCols).exec().get();
case Eigen:
return new CategoricalEigenEncoder(tev, dataset, skipCols).exec().get();
case LabelEncoder:
return new CategoricalLabelEncoder(dataset, skipCols).exec().get();
default:
throw H2O.unimpl();
}
}
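/**
 * Log the cardinality of the (up to) topK highest-cardinality categorical columns of a frame.
 *
 * @param fr frame to inspect
 * @param warn if true, log at WARN level, otherwise at INFO level
 * @param topK maximum number of columns to report
 */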
public static void printTopCategoricalLevels(Frame fr, boolean warn, int topK) {
String[][] domains = fr.domains();
String[] names = fr.names();
int len = domains.length;
int[] levels = new int[len];
for (int i = 0; i < len; ++i)
levels[i] = domains[i] != null ? domains[i].length : 0;
Arrays.sort(levels);
if (levels[len - 1] > 0) {
int levelcutoff = levels[len - 1 - Math.min(topK, len - 1)];
int count = 0;
for (int i = 0; i < len && count < topK; ++i) {
if (domains[i] != null && domains[i].length >= levelcutoff) {
if (warn)
Log.warn("Categorical feature '" + names[i] + "' has cardinality " + domains[i].length + ".");
else
Log.info("Categorical feature '" + names[i] + "' has cardinality " + domains[i].length + ".");
    count++; // count only the columns actually reported, so up to topK get logged
  }
}
}
}
private static class Vec2ArryTsk extends MRTask<Vec2ArryTsk> {
final int N;
public double [] res;
public Vec2ArryTsk(int N){this.N = N;}
@Override public void setupLocal(){
res = MemoryManager.malloc8d(N);
}
@Override public void map(Chunk c){
final int off = (int)c.start();
for(int i = 0; i < c._len; i = c.nextNZ(i))
res[off+i] = c.atd(i);
}
@Override public void reduce(Vec2ArryTsk other){
if(res != other.res) {
for(int i = 0; i < res.length; ++i) {
assert res[i] == 0 || other.res[i] == 0;
res[i] += other.res[i]; // assuming only one nonzero
}
}
}
}
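/** Materialize a small Vec (at most 100,000 elements) into a double array. */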
public static double [] asDoubles(Vec v){
if(v.length() > 100000) throw new IllegalArgumentException("Vec is too big to be extracted into an array (limit: 100,000 elements)");
return new Vec2ArryTsk((int)v.length()).doAll(v).res;
}
private static class Vec2IntArryTsk extends MRTask<Vec2IntArryTsk> {
final int N;
public int [] res;
public Vec2IntArryTsk(int N){this.N = N;}
@Override public void setupLocal(){
res = MemoryManager.malloc4(N);
}
@Override public void map(Chunk c){
final int off = (int)c.start();
for(int i = 0; i < c._len; i = c.nextNZ(i))
res[off+i] = (int)c.at8(i);
}
@Override public void reduce(Vec2IntArryTsk other){
if(res != other.res) {
for(int i = 0; i < res.length; ++i) {
assert res[i] == 0 || other.res[i] == 0;
res[i] += other.res[i]; // assuming only one nonzero
}
}
}
}
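/** Materialize a small integer Vec (at most 100,000 elements) into an int array. */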
public static int [] asInts(Vec v){
if(v.length() > 100000) throw new IllegalArgumentException("Vec is too big to be extracted into an array (limit: 100,000 elements)");
return new Vec2IntArryTsk((int)v.length()).doAll(v).res;
}
/**
 * Compute a chunk summary (how many chunks of each type, relative size, total size).
 * @param fr frame to summarize
 * @return chunk summary
 */
public static ChunkSummary chunkSummary(Frame fr) {
return new ChunkSummary().doAll(fr);
}
/** Generate a given number of keys by appending a numbered suffix to the given key. */
public static Key[] generateNumKeys(Key mk, int num) { return generateNumKeys(mk, num, "_part"); }
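/** Generate {@code num} keys of the form {@code <name><delim><i>}, re-appending a trailing ".hex"
 * suffix after the index. For example, {@code generateNumKeys(Key.make("data.hex"), 2)} yields
 * the keys "data_part0.hex" and "data_part1.hex". */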
public static Key[] generateNumKeys(Key mk, int num, String delim) {
Key[] ks = new Key[num];
String n = mk!=null ? mk.toString() : "noname";
String suffix = "";
if (n.endsWith(".hex")) {
n = n.substring(0, n.length()-4); // be nice
suffix = ".hex";
}
for (int i=0; i<num; i++) ks[i] = Key.make(n+delim+i+suffix);
return ks;
}
/**
* Helper to insert missing values into a Frame
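 *
 * A minimal usage sketch (frame key, seed, and fraction are illustrative):
 * <pre>{@code
 * Job<Frame> j = new FrameUtils.MissingInserter(fr._key, 1234L, 0.1).execImpl();
 * j.get(); // block until, in expectation, 10% of the values have been set to NA
 * }</pre>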
*/
public static class MissingInserter extends Iced {
Job<Frame> _job;
final Key<Frame> _dataset;
final double _fraction;
final long _seed;
public MissingInserter(Key<Frame> frame, long seed, double frac){
_dataset = frame; _seed = seed; _fraction = frac;
}
/**
* Driver for MissingInserter
*/
class MissingInserterDriver extends H2O.H2OCountedCompleter {
transient final Frame _frame;
MissingInserterDriver(Frame frame) {_frame = frame; }
@Override
public void compute2() {
new MRTask() {
@Override public void map (Chunk[]cs){
final Random rng = RandomUtils.getRNG(0);
for (int c = 0; c < cs.length; c++) {
for (int r = 0; r < cs[c]._len; r++) {
rng.setSeed(_seed + 1234 * c ^ 1723 * (cs[c].start() + r));
if (rng.nextDouble() < _fraction) cs[c].setNA(r);
}
}
_job.update(1);
}
}.doAll(_frame);
tryComplete();
}
}
public Job<Frame> execImpl() {
_job = new Job<>(_dataset, Frame.class.getName(), "MissingValueInserter");
if (DKV.get(_dataset) == null)
throw new IllegalArgumentException("Invalid Frame key " + _dataset + " (Frame doesn't exist).");
if (_fraction < 0 || _fraction > 1 ) throw new IllegalArgumentException("fraction must be between 0 and 1.");
final Frame frame = DKV.getGet(_dataset);
MissingInserterDriver mid = new MissingInserterDriver(frame);
int work = frame.vecs()[0].nChunks();
return _job.start(mid, work);
}
}
/**
 * Compute the average stored-element ratio of the given chunks: dense chunks contribute 1,
 * sparse chunks contribute the fraction of elements they actually store.
 * @param chks chunks to inspect
 * @return average ratio, between 0 and 1
 */
public static double sparseRatio(Chunk [] chks) {
double cnt = 0;
double reg = 1.0/chks.length;
for(Chunk c :chks)
if(c.isSparseNA()){
cnt += c.sparseLenNA()/(double)c.len();
} else if(c.isSparseZero()){
cnt += c.sparseLenZero()/(double)c.len();
} else cnt += 1;
return cnt * reg;
}
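/** Compute the average sparsity ratio across all Vecs of the given frame. */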
public static double sparseRatio(Frame fr) {
double reg = 1.0/fr.numCols();
double res = 0;
for(Vec v:fr.vecs())
res += v.sparseRatio();
return res * reg;
}
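/**
 * Map-reduce task computing the weighted mean of (response - offset), skipping rows with an NA
 * response or a zero weight.
 */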
public static class WeightedMean extends MRTask<WeightedMean> {
private double _wresponse;
private double _wsum;
public double weightedMean() {
return _wsum == 0 ? 0 : _wresponse / _wsum;
}
@Override public void map(Chunk response, Chunk weight, Chunk offset) {
for (int i=0;i<response._len;++i) {
if (response.isNA(i)) continue;
double w = weight.atd(i);
if (w == 0) continue;
_wresponse += w*(response.atd(i)-offset.atd(i));
_wsum += w;
}
}
@Override public void reduce(WeightedMean mrt) {
_wresponse += mrt._wresponse;
_wsum += mrt._wsum;
}
}
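/**
 * Driver for exporting a Frame to CSV, either as a single file or as a directory of part files.
 * With nParts == 1 the file is written by the node running the driver (so that on a non-distributed
 * filesystem it lands on that node's local disk); with nParts < 0 the number of parts is determined
 * automatically from the estimated CSV size, targeting one HDFS block per part.
 */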
public static class ExportTaskDriver extends H2O.H2OCountedCompleter<ExportTaskDriver> {
private static final long DEFAULT_TARGET_PART_SIZE = 134217728L; // 128 MB, the default HDFS block size
private static final int AUTO_PARTS_MAX = 128; // maximum number of parts when automatic determination is enabled
final Frame _frame;
final String _path;
final String _frameName;
final boolean _overwrite;
final Job _j;
int _nParts;
public ExportTaskDriver(Frame frame, String path, String frameName, boolean overwrite, Job j, int nParts) {
_frame = frame;
_path = path;
_frameName = frameName;
_overwrite = overwrite;
_j = j;
_nParts = nParts;
}
@Override
public void compute2() {
_frame.read_lock(_j._key);
if (_nParts == 1) {
// Single file export, the file should be created by the node that was asked to export the data
// (this is for non-distributed filesystems, we want the file to go to the local filesystem of the node)
Frame.CSVStream is = new Frame.CSVStream(_frame, true, false);
exportCSVStream(is, _path, 0);
tryComplete();
} else {
// Multi-part export
if (_nParts < 0) {
_nParts = calculateNParts();
assert _nParts > 0;
}
int nChunksPerPart = ((_frame.anyVec().nChunks() - 1) / _nParts) + 1;
new PartExportTask(this, _frame._names, nChunksPerPart).dfork(_frame);
}
}
@Override
public void onCompletion(CountedCompleter caller) {
_frame.unlock(_j);
}
@Override
public boolean onExceptionalCompletion(Throwable t, CountedCompleter caller) {
_frame.unlock(_j);
return super.onExceptionalCompletion(t, caller);
}
private int calculateNParts() {
EstimateSizeTask estSize = new EstimateSizeTask().dfork(_frame).getResult();
Log.debug("Estimator result: ", estSize);
// the goal is to avoid creating too many small part files; the ideal part file size is one HDFS block
int nParts = Math.max((int) (estSize._size / DEFAULT_TARGET_PART_SIZE), H2O.CLOUD.size() + 1);
if (nParts > AUTO_PARTS_MAX) {
Log.debug("Recommended number of part files (" + nParts + ") exceeds maximum limit " + AUTO_PARTS_MAX + ". " +
"Number of part files is limited to avoid slow downs when importing back to H2O."); // @tomk
nParts = AUTO_PARTS_MAX;
}
Log.info("For file of estimated size " + estSize + "B determined number of parts: " + _nParts);
return nParts;
}
/**
* Trivial CSV file size estimator. Uses the first line of each non-empty chunk to estimate the size of the chunk.
* The total estimated size is the sum of the estimated chunk sizes.
*/
class EstimateSizeTask extends MRTask<EstimateSizeTask> {
// OUT
int _nNonEmpty;
long _size;
@Override
public void map(Chunk[] cs) {
if (cs[0]._len == 0) return;
Frame.CSVStream is = new Frame.CSVStream(cs, null, 1, false);
try {
_nNonEmpty++;
_size += is.getCurrentRowSize() * cs[0]._len;
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
try { is.close(); } catch (Exception e) { Log.err(e); }
}
}
@Override
public void reduce(EstimateSizeTask mrt) {
_nNonEmpty += mrt._nNonEmpty;
_size += mrt._size;
}
@Override
public String toString() {
return "EstimateSizeTask{_nNonEmpty=" + _nNonEmpty + ", _size=" + _size + '}';
}
}
private long copyCSVStream(Frame.CSVStream is, OutputStream os, int firstChkIdx, int buffer_size) throws IOException {
long len = 0;
byte[] bytes = new byte[buffer_size];
int curChkIdx = firstChkIdx;
for (;;) {
int count = is.read(bytes, 0, buffer_size);
if (count <= 0) {
break;
}
len += count;
os.write(bytes, 0, count);
int workDone = is._curChkIdx - curChkIdx;
if (workDone > 0) {
if (_j.stop_requested()) throw new Job.JobCancelledException();
_j.update(workDone);
curChkIdx = is._curChkIdx;
}
}
return len;
}
private void exportCSVStream(Frame.CSVStream is, String path, int firstChkIdx) {
OutputStream os = null;
long written = -1;
try {
os = H2O.getPM().create(path, _overwrite);
written = copyCSVStream(is, os, firstChkIdx, 4 * 1024 * 1024);
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
if (os != null) {
try {
os.flush(); // Seems redundant, but we have seen short file reads on Windows without it
os.close();
Log.info("Written " + written + " bytes of key '" + _frameName + "' to " + _path + ".");
} catch (Exception e) {
Log.err(e);
}
}
try { is.close(); } catch (Exception e) { Log.err(e); }
}
}
class PartExportTask extends MRTask<PartExportTask> {
final String[] _colNames;
final int _length;
PartExportTask(H2O.H2OCountedCompleter<?> completer, String[] colNames, int length) {
super(completer);
_colNames = colNames;
_length = length;
}
@Override
public void map(Chunk[] cs) {
Chunk anyChunk = cs[0];
if (anyChunk.cidx() % _length > 0) {
return;
}
int partIdx = anyChunk.cidx() / _length;
String partPath = _path + "/part-m-" + String.valueOf(100000 + partIdx).substring(1);
Frame.CSVStream is = new Frame.CSVStream(cs, _colNames, _length, false);
exportCSVStream(is, partPath, anyChunk.cidx());
}
@Override
protected void setupLocal() {
boolean created = H2O.getPM().mkdirs(_path);
if (! created) Log.warn("Path ", _path, " was not created.");
}
}
}
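/**
 * Helper to expand each categorical column into one indicator column per level, plus one extra
 * column for NAs. Non-categorical columns and the columns listed in skipCols are copied unchanged.
 */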
public static class CategoricalOneHotEncoder extends Iced {
final Frame _frame;
Job<Frame> _job;
final String[] _skipCols;
public CategoricalOneHotEncoder(Frame dataset, String[] skipCols) {
_frame = dataset;
_skipCols = skipCols;
}
/**
* Driver for CategoricalOneHotEncoder
*/
class CategoricalOneHotEncoderDriver extends H2O.H2OCountedCompleter {
final Frame _frame;
final Key<Frame> _destKey;
final String[] _skipCols;
CategoricalOneHotEncoderDriver(Frame frame, Key<Frame> destKey, String[] skipCols) { _frame = frame; _destKey = destKey; _skipCols = skipCols; }
class OneHotConverter extends MRTask<OneHotConverter> {
int[] _categorySizes;
public OneHotConverter(int[] categorySizes) { _categorySizes = categorySizes; }
@Override public void map(Chunk[] cs, NewChunk[] ncs) {
int targetColOffset = 0;
for (int iCol = 0; iCol < cs.length; ++iCol) {
Chunk col = cs[iCol];
int numTargetColumns = _categorySizes[iCol];
for (int iRow = 0; iRow < col._len; ++iRow) {
long val = col.isNA(iRow)? numTargetColumns-1 : col.at8(iRow);
for (int j = 0; j < numTargetColumns; ++j) {
ncs[targetColOffset + j].addNum(val==j ? 1 : 0, 0);
}
}
targetColOffset += numTargetColumns;
}
}
}
@Override public void compute2() {
Vec[] frameVecs = _frame.vecs();
int numCategoricals = 0;
for (int i=0;i<frameVecs.length;++i)
if (frameVecs[i].isCategorical() && ArrayUtils.find(_skipCols, _frame._names[i])==-1)
numCategoricals++;
Vec[] extraVecs = new Vec[_skipCols.length];
for (int i=0; i< extraVecs.length; ++i) {
Vec v = _frame.vec(_skipCols[i]); //can be null
if (v!=null) extraVecs[i] = v;
}
Frame categoricalFrame = new Frame();
Frame outputFrame = new Frame(_destKey);
int[] categorySizes = new int[numCategoricals];
int numOutputColumns = 0;
List<String> catnames= new ArrayList<>();
for (int i = 0, j = 0; i < frameVecs.length; ++i) {
if (ArrayUtils.find(_skipCols, _frame._names[i])>=0) continue;
int numCategories = frameVecs[i].cardinality(); // Returns -1 if non-categorical variable
if (numCategories > 0) {
categoricalFrame.add(_frame.name(i), frameVecs[i]);
categorySizes[j] = numCategories + 1/* for NAs */;
numOutputColumns += categorySizes[j];
for (int k=0;k<categorySizes[j]-1;++k)
catnames.add(_frame.name(i) + "." + _frame.vec(i).domain()[k]);
catnames.add(_frame.name(i) + ".missing(NA)");
++j;
} else {
outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
}
}
OneHotConverter mrtask = new OneHotConverter(categorySizes);
Frame binaryCols = mrtask.doAll(numOutputColumns, Vec.T_NUM, categoricalFrame).outputFrame();
binaryCols.setNames(catnames.toArray(new String[0]));
outputFrame.add(binaryCols);
for (int i=0;i<extraVecs.length;++i) {
if (extraVecs[i]!=null)
outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
}
DKV.put(outputFrame);
tryComplete();
}
}
public Job<Frame> exec() {
if (_frame == null)
throw new IllegalArgumentException("Frame doesn't exist.");
Key<Frame> destKey = Key.makeSystem(Key.make().toString());
_job = new Job<>(destKey, Frame.class.getName(), "CategoricalOneHotEncoder");
int workAmount = _frame.lastVec().nChunks();
return _job.start(new CategoricalOneHotEncoderDriver(_frame, destKey, _skipCols), workAmount);
}
}
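/**
 * Helper to replace each categorical column with a single numeric column holding the level index
 * (label encoding). Non-categorical columns and the columns listed in skipCols are copied unchanged.
 */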
public static class CategoricalLabelEncoder extends Iced {
final Frame _frame;
Job<Frame> _job;
final String[] _skipCols;
public CategoricalLabelEncoder(Frame dataset, String[] skipCols) {
_frame = dataset;
_skipCols = skipCols;
}
/**
* Driver for CategoricalLabelEncoder
*/
class CategoricalLabelEncoderDriver extends H2O.H2OCountedCompleter {
final Frame _frame;
final Key<Frame> _destKey;
final String[] _skipCols;
CategoricalLabelEncoderDriver(Frame frame, Key<Frame> destKey, String[] skipCols) { _frame = frame; _destKey = destKey; _skipCols = skipCols; }
@Override public void compute2() {
Vec[] frameVecs = _frame.vecs();
Vec[] extraVecs = _skipCols==null?null:new Vec[_skipCols.length];
if (extraVecs!=null) {
for (int i = 0; i < extraVecs.length; ++i) {
Vec v = _frame.vec(_skipCols[i]); //can be null
if (v != null) extraVecs[i] = v;
}
}
Frame outputFrame = new Frame(_destKey);
for (int i = 0, j = 0; i < frameVecs.length; ++i) {
if (_skipCols!=null && ArrayUtils.find(_skipCols, _frame._names[i])>=0) continue;
int numCategories = frameVecs[i].cardinality(); // Returns -1 if non-categorical variable
if (numCategories > 0) {
outputFrame.add(_frame.name(i), frameVecs[i].toNumericVec());
} else
outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
}
if (_skipCols!=null) {
for (int i = 0; i < extraVecs.length; ++i) {
if (extraVecs[i] != null)
outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
}
}
DKV.put(outputFrame);
tryComplete();
}
}
public Job<Frame> exec() {
if (_frame == null)
throw new IllegalArgumentException("Frame doesn't exist.");
Key<Frame> destKey = Key.makeSystem(Key.make().toString());
_job = new Job<>(destKey, Frame.class.getName(), "CategoricalLabelEncoder");
int workAmount = _frame.lastVec().nChunks();
return _job.start(new CategoricalLabelEncoderDriver(_frame, destKey, _skipCols), workAmount);
}
}
/**
* Helper to convert a categorical variable into a "binary" encoding format. In this format each categorical value
* is first assigned an integer code, the code is written out in binary, and each bit of that binary representation
* becomes a separate column. This is intended as an improvement over a plain one-hot transformation.
* For each categorical variable we assume that the number of categories is 1 + the domain cardinality; the extra
* category is reserved for NAs.
* See http://www.willmcginnis.com/2015/11/29/beyond-one-hot-an-exploration-of-categorical-variables/
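* For example, a categorical column with 5 levels uses the codes 0..5 (NA maps to 0, level k to
* k + 1) and therefore needs 1 + floor(log2(5)) = 3 bit columns.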
*/
public static class CategoricalBinaryEncoder extends Iced {
final Frame _frame;
Job<Frame> _job;
final String[] _skipCols;
public CategoricalBinaryEncoder(Frame dataset, String[] skipCols) {
_frame = dataset;
_skipCols = skipCols;
}
/**
* Driver for CategoricalBinaryEncoder
*/
class CategoricalBinaryEncoderDriver extends H2O.H2OCountedCompleter {
final Frame _frame;
final Key<Frame> _destKey;
final String[] _skipCols;
CategoricalBinaryEncoderDriver(Frame frame, Key<Frame> destKey, String[] skipCols) { _frame = frame; _destKey = destKey; _skipCols = skipCols; }
class BinaryConverter extends MRTask<BinaryConverter> {
int[] _categorySizes;
public BinaryConverter(int[] categorySizes) { _categorySizes = categorySizes; }
@Override public void map(Chunk[] cs, NewChunk[] ncs) {
int targetColOffset = 0;
for (int iCol = 0; iCol < cs.length; ++iCol) {
Chunk col = cs[iCol];
int numTargetColumns = _categorySizes[iCol];
for (int iRow = 0; iRow < col._len; ++iRow) {
long val = col.isNA(iRow)? 0 : 1 + col.at8(iRow);
for (int j = 0; j < numTargetColumns; ++j) {
ncs[targetColOffset + j].addNum(val & 1, 0);
val >>>= 1;
}
assert val == 0 : "categorical value did not fit into the allocated bit columns";
}
targetColOffset += numTargetColumns;
}
}
}
@Override public void compute2() {
Vec[] frameVecs = _frame.vecs();
int numCategoricals = 0;
for (int i=0;i<frameVecs.length;++i)
if (frameVecs[i].isCategorical() && (_skipCols==null || ArrayUtils.find(_skipCols, _frame._names[i])==-1))
numCategoricals++;
Vec[] extraVecs = _skipCols==null?null:new Vec[_skipCols.length];
if (extraVecs!=null) {
for (int i = 0; i < extraVecs.length; ++i) {
Vec v = _frame.vec(_skipCols[i]); //can be null
if (v != null) extraVecs[i] = v;
}
}
Frame categoricalFrame = new Frame();
Frame outputFrame = new Frame(_destKey);
int[] binaryCategorySizes = new int[numCategoricals];
int numOutputColumns = 0;
for (int i = 0, j = 0; i < frameVecs.length; ++i) {
if (_skipCols!=null && ArrayUtils.find(_skipCols, _frame._names[i])>=0) continue;
int numCategories = frameVecs[i].cardinality(); // Returns -1 if non-categorical variable
if (numCategories > 0) {
categoricalFrame.add(_frame.name(i), frameVecs[i]);
binaryCategorySizes[j] = 1 + MathUtils.log2(numCategories - 1 + 1/* for NAs */);
numOutputColumns += binaryCategorySizes[j];
++j;
} else
outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
}
BinaryConverter mrtask = new BinaryConverter(binaryCategorySizes);
Frame binaryCols = mrtask.doAll(numOutputColumns, Vec.T_NUM, categoricalFrame).outputFrame();
// change names of binaryCols so that they reflect the original names of the categories
for (int i = 0, j = 0; i < binaryCategorySizes.length; j += binaryCategorySizes[i++]) {
for (int k = 0; k < binaryCategorySizes[i]; ++k) {
binaryCols._names[j + k] = categoricalFrame.name(i) + ":" + k;
}
}
outputFrame.add(binaryCols);
if (_skipCols!=null) {
for (int i = 0; i < extraVecs.length; ++i) {
if (extraVecs[i] != null)
outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
}
}
DKV.put(outputFrame);
tryComplete();
}
}
public Job<Frame> exec() {
if (_frame == null)
throw new IllegalArgumentException("Frame doesn't exist.");
Key<Frame> destKey = Key.makeSystem(Key.make().toString());
_job = new Job<>(destKey, Frame.class.getName(), "CategoricalBinaryEncoder");
int workAmount = _frame.lastVec().nChunks();
return _job.start(new CategoricalBinaryEncoderDriver(_frame, destKey, _skipCols), workAmount);
}
}
/**
* Helper to convert a categorical variable into the first eigenvector of the dummy-expanded matrix.
*/
public static class CategoricalEigenEncoder {
final Frame _frame;
Job<Frame> _job;
final String[] _skipCols;
final ToEigenVec _tev;
public CategoricalEigenEncoder(ToEigenVec tev, Frame dataset, String[] skipCols) {
_frame = dataset;
_skipCols = skipCols;
_tev = tev;
}
/**
* Driver for CategoricalEigenEncoder
*/
class CategoricalEigenEncoderDriver extends H2O.H2OCountedCompleter {
final Frame _frame;
final Key<Frame> _destKey;
final String[] _skipCols;
final ToEigenVec _tev;
CategoricalEigenEncoderDriver(ToEigenVec tev, Frame frame, Key<Frame> destKey, String[] skipCols) {
_tev = tev; _frame = frame; _destKey = destKey; _skipCols = skipCols;
assert _tev!=null : "Override toEigenVec for this Algo!";
}
@Override public void compute2() {
Vec[] frameVecs = _frame.vecs();
Vec[] extraVecs = new Vec[_skipCols==null?0:_skipCols.length];
for (int i=0; i< extraVecs.length; ++i) {
Vec v = _skipCols==null||_skipCols.length<=i?null:_frame.vec(_skipCols[i]); //can be null
if (v!=null) extraVecs[i] = v;
}
Frame outputFrame = new Frame(_destKey);
for (int i = 0; i < frameVecs.length; ++i) {
if (_skipCols!=null && ArrayUtils.find(_skipCols, _frame._names[i])>=0) continue;
if (frameVecs[i].isCategorical())
outputFrame.add(_frame.name(i) + ".Eigen", _tev.toEigenVec(frameVecs[i]));
else
outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
}
for (int i=0;i<extraVecs.length;++i) {
if (extraVecs[i]!=null)
outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
}
DKV.put(outputFrame);
tryComplete();
}
}
public Job<Frame> exec() {
if (_frame == null)
throw new IllegalArgumentException("Frame doesn't exist.");
Key<Frame> destKey = Key.makeSystem(Key.make().toString());
_job = new Job<>(destKey, Frame.class.getName(), "CategoricalEigenEncoder");
int workAmount = _frame.lastVec().nChunks();
return _job.start(new CategoricalEigenEncoderDriver(_tev, _frame, destKey, _skipCols), workAmount);
}
}
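/** Remove all keys of the given map from the DKV, blocking until the removals complete. */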
static public void cleanUp(IcedHashMap<Key, String> toDelete) {
Futures fs = new Futures();
for (Key k : toDelete.keySet()) {
k.remove(fs);
}
fs.blockForPending();
toDelete.clear();
}
/**
 * Reduce the domains of all categorical columns to the subset of levels actually observed in the data.
 * @param frameToModifyInPlace frame whose categorical Vecs are remapped in place
 */
static public void shrinkDomainsToObservedSubset(Frame frameToModifyInPlace) {
for (Vec v : frameToModifyInPlace.vecs()) {
if (v.isCategorical()) {
long[] uniques = (v.min() >= 0 && v.max() < Integer.MAX_VALUE - 4) ? new VecUtils.CollectDomainFast((int)v.max()).doAll(v).domain() : new VecUtils.CollectDomain().doAll(v).domain();
String[] newDomain = new String[uniques.length];
final int[] fromTo = new int[(int)ArrayUtils.maxValue(uniques)+1];
for (int i=0;i<newDomain.length;++i) {
newDomain[i] = v.domain()[(int) uniques[i]];
fromTo[(int)uniques[i]] = i; //helper for value mapping
}
new MRTask() {
@Override
public void map(Chunk c) {
for (int i=0;i<c._len;++i) {
if (c.isNA(i)) continue;
else c.set(i, fromTo[(int)c.at8(i)]);
}
}
}.doAll(v);
v.setDomain(newDomain);
}
}
}
}