package water.util;

import hex.Model;
import hex.ToEigenVec;
import jsr166y.CountedCompleter;
import water.*;
import water.fvec.*;
import water.parser.ParseDataset;
import water.parser.ParseSetup;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

public class FrameUtils {

  /** Parse the given file(s) into a single frame represented by the given key.
   *
   * @param okey destination key for the parsed frame
   * @param files files to parse
   * @return a new frame
   */
  public static Frame parseFrame(Key okey, File ...files) throws IOException {
    if (files == null || files.length == 0) {
      throw new IllegalArgumentException("List of files is empty!");
    }
    for (File f : files) {
      if (!f.exists())
        throw new FileNotFoundException("File not found " + f);
    }
    // Create output key if it is not given
    if (okey == null) okey = Key.make(files[0].getName());
    Key[] inKeys = new Key[files.length];
    for (int i=0; i<files.length; i++) inKeys[i] = NFSFileVec.make(files[i])._key;
    return ParseDataset.parse(okey, inKeys);
  }

  /** Parse the given set of URIs and produce a frame representing the output.
   *
   * @param okey key for the output frame. Can be null
   * @param uris array of URIs (file://, hdfs://, s3n://, s3a://, s3://, ...) to parse
   * @return a frame which is saved into DKV under okey
   * @throws IOException in case of parse error.
   */
  public static Frame parseFrame(Key okey, URI ...uris) throws IOException {
    return parseFrame(okey, null, uris);
  }

  public static Frame parseFrame(Key okey, ParseSetup parseSetup, URI ...uris) throws IOException {
    if (uris == null || uris.length == 0) {
      throw new IllegalArgumentException("List of uris is empty!");
    }
    if (okey == null) okey = Key.make(uris[0].toString());
    Key[] inKeys = new Key[uris.length];
    for (int i=0; i<uris.length; i++) inKeys[i] = H2O.getPM().anyURIToKey(uris[i]);
    // Return result
    return parseSetup != null ? ParseDataset.parse(okey, inKeys, true, ParseSetup.guessSetup(inKeys, parseSetup))
                              : ParseDataset.parse(okey, inKeys);
  }

  public static ParseSetup guessParserSetup(ParseSetup userParserSetup, URI ...uris) throws IOException {
    Key[] inKeys = new Key[uris.length];
    for (int i=0; i<uris.length; i++) inKeys[i] = H2O.getPM().anyURIToKey(uris[i]);
    return ParseSetup.guessSetup(inKeys, userParserSetup);
  }

  public static Frame categoricalEncoder(Frame dataset, String[] skipCols,
                                         Model.Parameters.CategoricalEncodingScheme scheme, ToEigenVec tev) {
    switch (scheme) {
      case AUTO:
      case Enum:
      case SortByResponse: //the work is done in ModelBuilder - the domain is all we need to change once, adaptTestTrain takes care of test set adaptation
      case OneHotInternal:
        return dataset; //leave as is - most algos do their own internal default handling of enums
      case OneHotExplicit:
        return new CategoricalOneHotEncoder(dataset, skipCols).exec().get();
      case Binary:
        return new CategoricalBinaryEncoder(dataset, skipCols).exec().get();
      case Eigen:
        return new CategoricalEigenEncoder(tev, dataset, skipCols).exec().get();
      case LabelEncoder:
        return new CategoricalLabelEncoder(dataset, skipCols).exec().get();
      default:
        throw H2O.unimpl();
    }
  }
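
  // Illustrative usage sketch (not part of the original source): applying an explicit one-hot
  // encoding to a frame while leaving a hypothetical "response" column untouched; "train" is a
  // hypothetical Frame already in the DKV.
  //
  //   Frame encoded = FrameUtils.categoricalEncoder(train, new String[]{"response"},
  //       Model.Parameters.CategoricalEncodingScheme.OneHotExplicit, null);
  //
  // The tev argument is only consulted for the Eigen scheme; the other schemes ignore it.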
  public static void printTopCategoricalLevels(Frame fr, boolean warn, int topK) {
    String[][] domains = fr.domains();
    String[] names = fr.names();
    int len = domains.length;
    int[] levels = new int[len];
    for (int i = 0; i < len; ++i) levels[i] = domains[i] != null ? domains[i].length : 0;
    Arrays.sort(levels);
    if (levels[len - 1] > 0) {
      int levelcutoff = levels[len - 1 - Math.min(topK, len - 1)];
      int count = 0;
      for (int i = 0; i < len && count < topK; ++i) {
        if (domains[i] != null && domains[i].length >= levelcutoff) {
          if (warn)
            Log.warn("Categorical feature '" + names[i] + "' has cardinality " + domains[i].length + ".");
          else
            Log.info("Categorical feature '" + names[i] + "' has cardinality " + domains[i].length + ".");
        }
        count++;
      }
    }
  }

  private static class Vec2ArryTsk extends MRTask<Vec2ArryTsk> {
    final int N;
    public double[] res;
    public Vec2ArryTsk(int N) { this.N = N; }
    @Override public void setupLocal() {
      res = MemoryManager.malloc8d(N);
    }
    @Override public void map(Chunk c) {
      final int off = (int) c.start();
      for (int i = 0; i < c._len; i = c.nextNZ(i))
        res[off + i] = c.atd(i);
    }
    @Override public void reduce(Vec2ArryTsk other) {
      if (res != other.res) {
        for (int i = 0; i < res.length; ++i) {
          assert res[i] == 0 || other.res[i] == 0;
          res[i] += other.res[i]; // assuming only one nonzero
        }
      }
    }
  }

  public static double[] asDoubles(Vec v) {
    if (v.length() > 100000) throw new IllegalArgumentException("Vec is too big to be extracted into array");
    return new Vec2ArryTsk((int) v.length()).doAll(v).res;
  }

  private static class Vec2IntArryTsk extends MRTask<Vec2IntArryTsk> {
    final int N;
    public int[] res;
    public Vec2IntArryTsk(int N) { this.N = N; }
    @Override public void setupLocal() {
      res = MemoryManager.malloc4(N);
    }
    @Override public void map(Chunk c) {
      final int off = (int) c.start();
      for (int i = 0; i < c._len; i = c.nextNZ(i))
        res[off + i] = (int) c.at8(i);
    }
    @Override public void reduce(Vec2IntArryTsk other) {
      if (res != other.res) {
        for (int i = 0; i < res.length; ++i) {
          assert res[i] == 0 || other.res[i] == 0;
          res[i] += other.res[i]; // assuming only one nonzero
        }
      }
    }
  }

  public static int[] asInts(Vec v) {
    if (v.length() > 100000) throw new IllegalArgumentException("Vec is too big to be extracted into array");
    return new Vec2IntArryTsk((int) v.length()).doAll(v).res;
  }

  /**
   * Compute a chunk summary (how many chunks of each type, relative size, total size).
   * @param fr frame to summarize
   * @return chunk summary
   */
  public static ChunkSummary chunkSummary(Frame fr) {
    return new ChunkSummary().doAll(fr);
  }

  /** Generate a given number of keys by appending a numbered suffix to the given key. */
  public static Key[] generateNumKeys(Key mk, int num) { return generateNumKeys(mk, num, "_part"); }

  public static Key[] generateNumKeys(Key mk, int num, String delim) {
    Key[] ks = new Key[num];
    String n = mk != null ? mk.toString() : "noname";
    String suffix = "";
    if (n.endsWith(".hex")) {
      n = n.substring(0, n.length()-4); // be nice
      suffix = ".hex";
    }
    for (int i=0; i<num; i++)
      ks[i] = Key.make(n+delim+i+suffix);
    return ks;
  }
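
  // Illustrative example (not in the original source): generateNumKeys(Key.make("train.hex"), 3)
  // strips the ".hex" suffix, appends the default "_part" delimiter plus the part index, and
  // re-adds the suffix, producing keys "train_part0.hex", "train_part1.hex", "train_part2.hex".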
  /**
   * Helper to insert missing values into a Frame
   */
  public static class MissingInserter extends Iced {
    Job<Frame> _job;
    final Key<Frame> _dataset;
    final double _fraction;
    final long _seed;

    public MissingInserter(Key<Frame> frame, long seed, double frac) {
      _dataset = frame; _seed = seed; _fraction = frac;
    }

    /**
     * Driver for MissingInserter
     */
    class MissingInserterDriver extends H2O.H2OCountedCompleter {
      transient final Frame _frame;
      MissingInserterDriver(Frame frame) { _frame = frame; }
      @Override
      public void compute2() {
        new MRTask() {
          @Override public void map(Chunk[] cs) {
            final Random rng = RandomUtils.getRNG(0);
            for (int c = 0; c < cs.length; c++) {
              for (int r = 0; r < cs[c]._len; r++) {
                rng.setSeed(_seed + 1234 * c ^ 1723 * (cs[c].start() + r));
                if (rng.nextDouble() < _fraction) cs[c].setNA(r);
              }
            }
            _job.update(1);
          }
        }.doAll(_frame);
        tryComplete();
      }
    }

    public Job<Frame> execImpl() {
      _job = new Job<>(_dataset, Frame.class.getName(), "MissingValueInserter");
      if (DKV.get(_dataset) == null)
        throw new IllegalArgumentException("Invalid Frame key " + _dataset + " (Frame doesn't exist).");
      if (_fraction < 0 || _fraction > 1)
        throw new IllegalArgumentException("fraction must be between 0 and 1.");
      final Frame frame = DKV.getGet(_dataset);
      MissingInserterDriver mid = new MissingInserterDriver(frame);
      int work = frame.vecs()[0].nChunks();
      return _job.start(mid, work);
    }
  }
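
  // Illustrative usage sketch (not in the original source): setting roughly 20% of the cells of
  // a hypothetical Frame "train" (already in the DKV) to NA, then blocking until the job finishes.
  //
  //   new FrameUtils.MissingInserter(train._key, 0xDECAF, 0.2).execImpl().get();
  //
  // The inserter modifies the frame in place; the per-cell reseeding above makes the corruption
  // pattern deterministic for a given seed regardless of chunk layout.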
  /**
   * Compute the fraction of sparse chunks in this array.
   * @param chks chunks to inspect
   * @return fraction of sparse chunks
   */
  public static double sparseRatio(Chunk[] chks) {
    double cnt = 0;
    double reg = 1.0 / chks.length;
    for (Chunk c : chks)
      if (c.isSparseNA()) {
        cnt += c.sparseLenNA() / (double) c.len();
      } else if (c.isSparseZero()) {
        cnt += c.sparseLenZero() / (double) c.len();
      } else cnt += 1;
    return cnt * reg;
  }

  public static double sparseRatio(Frame fr) {
    double reg = 1.0 / fr.numCols();
    double res = 0;
    for (Vec v : fr.vecs())
      res += v.sparseRatio();
    return res * reg;
  }

  public static class WeightedMean extends MRTask<WeightedMean> {
    private double _wresponse;
    private double _wsum;
    public double weightedMean() {
      return _wsum == 0 ? 0 : _wresponse / _wsum;
    }
    @Override public void map(Chunk response, Chunk weight, Chunk offset) {
      for (int i=0; i<response._len; ++i) {
        if (response.isNA(i)) continue;
        double w = weight.atd(i);
        if (w == 0) continue;
        _wresponse += w * (response.atd(i) - offset.atd(i));
        _wsum += w;
      }
    }
    @Override public void reduce(WeightedMean mrt) {
      _wresponse += mrt._wresponse;
      _wsum += mrt._wsum;
    }
  }

  public static class ExportTaskDriver extends H2O.H2OCountedCompleter<ExportTaskDriver> {
    private static long DEFAULT_TARGET_PART_SIZE = 134217728L; // 128MB, default HDFS block size
    private static int AUTO_PARTS_MAX = 128; // maximum number of parts if automatic determination is enabled
    final Frame _frame;
    final String _path;
    final String _frameName;
    final boolean _overwrite;
    final Job _j;
    int _nParts;

    public ExportTaskDriver(Frame frame, String path, String frameName, boolean overwrite, Job j, int nParts) {
      _frame = frame;
      _path = path;
      _frameName = frameName;
      _overwrite = overwrite;
      _j = j;
      _nParts = nParts;
    }

    @Override
    public void compute2() {
      _frame.read_lock(_j._key);
      if (_nParts == 1) {
        // Single file export, the file should be created by the node that was asked to export the data
        // (this is for non-distributed filesystems, we want the file to go to the local filesystem of the node)
        Frame.CSVStream is = new Frame.CSVStream(_frame, true, false);
        exportCSVStream(is, _path, 0);
        tryComplete();
      } else {
        // Multi-part export
        if (_nParts < 0) {
          _nParts = calculateNParts();
          assert _nParts > 0;
        }
        int nChunksPerPart = ((_frame.anyVec().nChunks() - 1) / _nParts) + 1;
        new PartExportTask(this, _frame._names, nChunksPerPart).dfork(_frame);
      }
    }

    @Override
    public void onCompletion(CountedCompleter caller) {
      _frame.unlock(_j);
    }

    @Override
    public boolean onExceptionalCompletion(Throwable t, CountedCompleter caller) {
      _frame.unlock(_j);
      return super.onExceptionalCompletion(t, caller);
    }

    private int calculateNParts() {
      EstimateSizeTask estSize = new EstimateSizeTask().dfork(_frame).getResult();
      Log.debug("Estimator result: ", estSize);
      // the goal is not to create too-small part files (and too many files); ideal part file size is one HDFS block
      int nParts = Math.max((int) (estSize._size / DEFAULT_TARGET_PART_SIZE), H2O.CLOUD.size() + 1);
      if (nParts > AUTO_PARTS_MAX) {
        Log.debug("Recommended number of part files (" + nParts + ") exceeds maximum limit " + AUTO_PARTS_MAX + ". " +
                "Number of part files is limited to avoid slow downs when importing back to H2O."); // @tomk
        nParts = AUTO_PARTS_MAX;
      }
      Log.info("For file of estimated size " + estSize + "B determined number of parts: " + nParts);
      return nParts;
    }
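
    // Illustrative arithmetic (not in the original source): for a CSV estimated at 2 GB
    // (2147483648 bytes) on a 4-node cloud, 2147483648 / 134217728 = 16 and
    // H2O.CLOUD.size() + 1 = 5, so max(16, 5) = 16 part files are written; the count is only
    // reduced if it exceeds AUTO_PARTS_MAX (128).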
    /**
     * Trivial CSV file size estimator. Uses the first line of each non-empty chunk to estimate the size of the chunk.
     * The total estimated size is the total of the estimated chunk sizes.
     */
    class EstimateSizeTask extends MRTask<EstimateSizeTask> {
      // OUT
      int _nNonEmpty;
      long _size;

      @Override
      public void map(Chunk[] cs) {
        if (cs[0]._len == 0) return;
        Frame.CSVStream is = new Frame.CSVStream(cs, null, 1, false);
        try {
          _nNonEmpty++;
          _size += is.getCurrentRowSize() * cs[0]._len;
        } catch (IOException e) {
          throw new RuntimeException(e);
        } finally {
          try { is.close(); } catch (Exception e) { Log.err(e); }
        }
      }

      @Override
      public void reduce(EstimateSizeTask mrt) {
        _nNonEmpty += mrt._nNonEmpty;
        _size += mrt._size;
      }

      @Override
      public String toString() {
        return "EstimateSizeTask{_nNonEmpty=" + _nNonEmpty + ", _size=" + _size + '}';
      }
    }

    private long copyCSVStream(Frame.CSVStream is, OutputStream os, int firstChkIdx, int buffer_size) throws IOException {
      long len = 0;
      byte[] bytes = new byte[buffer_size];
      int curChkIdx = firstChkIdx;
      for (;;) {
        int count = is.read(bytes, 0, buffer_size);
        if (count <= 0) {
          break;
        }
        len += count;
        os.write(bytes, 0, count);
        int workDone = is._curChkIdx - curChkIdx;
        if (workDone > 0) {
          if (_j.stop_requested())
            throw new Job.JobCancelledException();
          _j.update(workDone);
          curChkIdx = is._curChkIdx;
        }
      }
      return len;
    }

    private void exportCSVStream(Frame.CSVStream is, String path, int firstChkIdx) {
      OutputStream os = null;
      long written = -1;
      try {
        os = H2O.getPM().create(path, _overwrite);
        written = copyCSVStream(is, os, firstChkIdx, 4 * 1024 * 1024);
      } catch (IOException e) {
        throw new RuntimeException(e);
      } finally {
        if (os != null) {
          try {
            os.flush(); // Seems redundant, but seeing a short-file-read on windows sometimes
            os.close();
            Log.info("Written " + written + " bytes of key '" + _frameName + "' to " + _path + ".");
          } catch (Exception e) {
            Log.err(e);
          }
        }
        try { is.close(); } catch (Exception e) { Log.err(e); }
      }
    }

    class PartExportTask extends MRTask<PartExportTask> {
      final String[] _colNames;
      final int _length;

      PartExportTask(H2O.H2OCountedCompleter<?> completer, String[] colNames, int length) {
        super(completer);
        _colNames = colNames;
        _length = length;
      }

      @Override
      public void map(Chunk[] cs) {
        Chunk anyChunk = cs[0];
        if (anyChunk.cidx() % _length > 0) {
          return;
        }
        int partIdx = anyChunk.cidx() / _length;
        String partPath = _path + "/part-m-" + String.valueOf(100000 + partIdx).substring(1);
        Frame.CSVStream is = new Frame.CSVStream(cs, _colNames, _length, false);
        exportCSVStream(is, partPath, anyChunk.cidx());
      }

      @Override
      protected void setupLocal() {
        boolean created = H2O.getPM().mkdirs(_path);
        if (!created) Log.warn("Path ", _path, " was not created.");
      }
    }
  }

  public static class CategoricalOneHotEncoder extends Iced {
    final Frame _frame;
    Job<Frame> _job;
    final String[] _skipCols;

    public CategoricalOneHotEncoder(Frame dataset, String[] skipCols) {
      _frame = dataset;
      _skipCols = skipCols;
    }
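
    // Illustrative example (not in the original source): a categorical column "color" with
    // domain {"red", "green", "blue"} expands into 4 indicator columns named
    // "color.red", "color.green", "color.blue", "color.missing(NA)". A row holding "green"
    // becomes 0,1,0,0 and a row with a missing value becomes 0,0,0,1.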
    /**
     * Driver for CategoricalOneHotEncoder
     */
    class CategoricalOneHotEncoderDriver extends H2O.H2OCountedCompleter {
      final Frame _frame;
      final Key<Frame> _destKey;
      final String[] _skipCols;
      CategoricalOneHotEncoderDriver(Frame frame, Key<Frame> destKey, String[] skipCols) {
        _frame = frame; _destKey = destKey; _skipCols = skipCols;
      }

      class OneHotConverter extends MRTask<OneHotConverter> {
        int[] _categorySizes;
        public OneHotConverter(int[] categorySizes) { _categorySizes = categorySizes; }

        @Override public void map(Chunk[] cs, NewChunk[] ncs) {
          int targetColOffset = 0;
          for (int iCol = 0; iCol < cs.length; ++iCol) {
            Chunk col = cs[iCol];
            int numTargetColumns = _categorySizes[iCol];
            for (int iRow = 0; iRow < col._len; ++iRow) {
              long val = col.isNA(iRow) ? numTargetColumns-1 : col.at8(iRow);
              for (int j = 0; j < numTargetColumns; ++j) {
                ncs[targetColOffset + j].addNum(val == j ? 1 : 0, 0);
              }
            }
            targetColOffset += numTargetColumns;
          }
        }
      }

      @Override public void compute2() {
        Vec[] frameVecs = _frame.vecs();
        int numCategoricals = 0;
        for (int i=0; i<frameVecs.length; ++i)
          if (frameVecs[i].isCategorical() && ArrayUtils.find(_skipCols, _frame._names[i]) == -1)
            numCategoricals++;

        Vec[] extraVecs = new Vec[_skipCols.length];
        for (int i=0; i<extraVecs.length; ++i) {
          Vec v = _frame.vec(_skipCols[i]); //can be null
          if (v != null) extraVecs[i] = v;
        }

        Frame categoricalFrame = new Frame();
        Frame outputFrame = new Frame(_destKey);
        int[] categorySizes = new int[numCategoricals];
        int numOutputColumns = 0;
        List<String> catnames = new ArrayList<>();
        for (int i = 0, j = 0; i < frameVecs.length; ++i) {
          if (ArrayUtils.find(_skipCols, _frame._names[i]) >= 0) continue;
          int numCategories = frameVecs[i].cardinality(); // Returns -1 if non-categorical variable
          if (numCategories > 0) {
            categoricalFrame.add(_frame.name(i), frameVecs[i]);
            categorySizes[j] = numCategories + 1/* for NAs */;
            numOutputColumns += categorySizes[j];
            for (int k=0; k<categorySizes[j]-1; ++k)
              catnames.add(_frame.name(i) + "." + _frame.vec(i).domain()[k]);
            catnames.add(_frame.name(i) + ".missing(NA)");
            ++j;
          } else {
            outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
          }
        }
        OneHotConverter mrtask = new OneHotConverter(categorySizes);
        Frame binaryCols = mrtask.doAll(numOutputColumns, Vec.T_NUM, categoricalFrame).outputFrame();
        binaryCols.setNames(catnames.toArray(new String[0]));
        outputFrame.add(binaryCols);
        for (int i=0; i<extraVecs.length; ++i) {
          if (extraVecs[i] != null)
            outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
        }
        DKV.put(outputFrame);
        tryComplete();
      }
    }

    public Job<Frame> exec() {
      if (_frame == null)
        throw new IllegalArgumentException("Frame doesn't exist.");
      Key<Frame> destKey = Key.makeSystem(Key.make().toString());
      _job = new Job<>(destKey, Frame.class.getName(), "CategoricalOneHotEncoder");
      int workAmount = _frame.lastVec().nChunks();
      return _job.start(new CategoricalOneHotEncoderDriver(_frame, destKey, _skipCols), workAmount);
    }
  }

  public static class CategoricalLabelEncoder extends Iced {
    final Frame _frame;
    Job<Frame> _job;
    final String[] _skipCols;

    public CategoricalLabelEncoder(Frame dataset, String[] skipCols) {
      _frame = dataset;
      _skipCols = skipCols;
    }
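
    // Illustrative note (not in the original source): label encoding collapses each categorical
    // column into a single numeric column (typically the level index: level 0 -> 0, level 1 -> 1,
    // and so on, via Vec.toNumericVec) instead of one indicator column per level as in one-hot
    // encoding; non-categorical columns are copied unchanged.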
    /**
     * Driver for CategoricalLabelEncoder
     */
    class CategoricalLabelEncoderDriver extends H2O.H2OCountedCompleter {
      final Frame _frame;
      final Key<Frame> _destKey;
      final String[] _skipCols;
      CategoricalLabelEncoderDriver(Frame frame, Key<Frame> destKey, String[] skipCols) {
        _frame = frame; _destKey = destKey; _skipCols = skipCols;
      }

      @Override public void compute2() {
        Vec[] frameVecs = _frame.vecs();
        Vec[] extraVecs = _skipCols == null ? null : new Vec[_skipCols.length];
        if (extraVecs != null) {
          for (int i = 0; i < extraVecs.length; ++i) {
            Vec v = _frame.vec(_skipCols[i]); //can be null
            if (v != null) extraVecs[i] = v;
          }
        }
        Frame outputFrame = new Frame(_destKey);
        for (int i = 0, j = 0; i < frameVecs.length; ++i) {
          if (_skipCols != null && ArrayUtils.find(_skipCols, _frame._names[i]) >= 0) continue;
          int numCategories = frameVecs[i].cardinality(); // Returns -1 if non-categorical variable
          if (numCategories > 0) {
            outputFrame.add(_frame.name(i), frameVecs[i].toNumericVec());
          } else
            outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
        }
        if (_skipCols != null) {
          for (int i = 0; i < extraVecs.length; ++i) {
            if (extraVecs[i] != null)
              outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
          }
        }
        DKV.put(outputFrame);
        tryComplete();
      }
    }

    public Job<Frame> exec() {
      if (_frame == null)
        throw new IllegalArgumentException("Frame doesn't exist.");
      Key<Frame> destKey = Key.makeSystem(Key.make().toString());
      _job = new Job<>(destKey, Frame.class.getName(), "CategoricalLabelEncoder");
      int workAmount = _frame.lastVec().nChunks();
      return _job.start(new CategoricalLabelEncoderDriver(_frame, destKey, _skipCols), workAmount);
    }
  }

  /**
   * Helper to convert a categorical variable into a "binary" encoding format. In this format each categorical value is
   * first assigned an integer value, then that integer is written in binary, and each bit column is converted into a
   * separate column. This is intended as an improvement over an existing one-hot transformation.
   * For each categorical variable we assume that the number of categories is 1 + domain cardinality; the extra
   * category is reserved for NAs.
   * See http://www.willmcginnis.com/2015/11/29/beyond-one-hot-an-exploration-of-categorical-variables/
   */
  public static class CategoricalBinaryEncoder extends Iced {
    final Frame _frame;
    Job<Frame> _job;
    final String[] _skipCols;

    public CategoricalBinaryEncoder(Frame dataset, String[] skipCols) {
      _frame = dataset;
      _skipCols = skipCols;
    }
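
    // Illustrative example (not in the original source): each row's level index i is shifted to
    // i + 1 (0 is reserved for NA) and the resulting integer is written out bit by bit, least
    // significant bit first, one bit per output column. For a column "size" with domain
    // {"S", "M", "L", "XL"}, a row holding "L" (index 2) encodes the value 3 = binary 011, so
    // its output columns "size:0", "size:1", "size:2", ... receive 1, 1, 0, ...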
    /**
     * Driver for CategoricalBinaryEncoder
     */
    class CategoricalBinaryEncoderDriver extends H2O.H2OCountedCompleter {
      final Frame _frame;
      final Key<Frame> _destKey;
      final String[] _skipCols;
      CategoricalBinaryEncoderDriver(Frame frame, Key<Frame> destKey, String[] skipCols) {
        _frame = frame; _destKey = destKey; _skipCols = skipCols;
      }

      class BinaryConverter extends MRTask<BinaryConverter> {
        int[] _categorySizes;
        public BinaryConverter(int[] categorySizes) { _categorySizes = categorySizes; }

        @Override public void map(Chunk[] cs, NewChunk[] ncs) {
          int targetColOffset = 0;
          for (int iCol = 0; iCol < cs.length; ++iCol) {
            Chunk col = cs[iCol];
            int numTargetColumns = _categorySizes[iCol];
            for (int iRow = 0; iRow < col._len; ++iRow) {
              long val = col.isNA(iRow) ? 0 : 1 + col.at8(iRow);
              for (int j = 0; j < numTargetColumns; ++j) {
                ncs[targetColOffset + j].addNum(val & 1, 0);
                val >>>= 1;
              }
              assert val == 0 : "";
            }
            targetColOffset += numTargetColumns;
          }
        }
      }

      @Override public void compute2() {
        Vec[] frameVecs = _frame.vecs();
        int numCategoricals = 0;
        for (int i=0; i<frameVecs.length; ++i)
          if (frameVecs[i].isCategorical() && (_skipCols == null || ArrayUtils.find(_skipCols, _frame._names[i]) == -1))
            numCategoricals++;

        Vec[] extraVecs = _skipCols == null ? null : new Vec[_skipCols.length];
        if (extraVecs != null) {
          for (int i = 0; i < extraVecs.length; ++i) {
            Vec v = _frame.vec(_skipCols[i]); //can be null
            if (v != null) extraVecs[i] = v;
          }
        }

        Frame categoricalFrame = new Frame();
        Frame outputFrame = new Frame(_destKey);
        int[] binaryCategorySizes = new int[numCategoricals];
        int numOutputColumns = 0;
        for (int i = 0, j = 0; i < frameVecs.length; ++i) {
          if (_skipCols != null && ArrayUtils.find(_skipCols, _frame._names[i]) >= 0) continue;
          int numCategories = frameVecs[i].cardinality(); // Returns -1 if non-categorical variable
          if (numCategories > 0) {
            categoricalFrame.add(_frame.name(i), frameVecs[i]);
            binaryCategorySizes[j] = 1 + MathUtils.log2(numCategories - 1 + 1/* for NAs */);
            numOutputColumns += binaryCategorySizes[j];
            ++j;
          } else
            outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
        }
        BinaryConverter mrtask = new BinaryConverter(binaryCategorySizes);
        Frame binaryCols = mrtask.doAll(numOutputColumns, Vec.T_NUM, categoricalFrame).outputFrame();
        // change names of binaryCols so that they reflect the original names of the categories
        for (int i = 0, j = 0; i < binaryCategorySizes.length; j += binaryCategorySizes[i++]) {
          for (int k = 0; k < binaryCategorySizes[i]; ++k) {
            binaryCols._names[j + k] = categoricalFrame.name(i) + ":" + k;
          }
        }
        outputFrame.add(binaryCols);
        if (_skipCols != null) {
          for (int i = 0; i < extraVecs.length; ++i) {
            if (extraVecs[i] != null)
              outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
          }
        }
        DKV.put(outputFrame);
        tryComplete();
      }
    }

    public Job<Frame> exec() {
      if (_frame == null)
        throw new IllegalArgumentException("Frame doesn't exist.");
      Key<Frame> destKey = Key.makeSystem(Key.make().toString());
      _job = new Job<>(destKey, Frame.class.getName(), "CategoricalBinaryEncoder");
      int workAmount = _frame.lastVec().nChunks();
      return _job.start(new CategoricalBinaryEncoderDriver(_frame, destKey, _skipCols), workAmount);
    }
  }

  /**
   * Helper to convert a categorical variable into the first eigenvector of the dummy-expanded matrix.
   */
  public static class CategoricalEigenEncoder {
    final Frame _frame;
    Job<Frame> _job;
    final String[] _skipCols;
    final ToEigenVec _tev;

    public CategoricalEigenEncoder(ToEigenVec tev, Frame dataset, String[] skipCols) {
      _frame = dataset;
      _skipCols = skipCols;
      _tev = tev;
    }
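
    // Illustrative note (not in the original source): each categorical column, e.g. "color", is
    // replaced by a single numeric column "color.Eigen" produced by the supplied ToEigenVec
    // implementation (the projection of the dummy-expanded column onto its first eigenvector);
    // non-categorical columns are copied unchanged.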
    /**
     * Driver for CategoricalEigenEncoder
     */
    class CategoricalEigenEncoderDriver extends H2O.H2OCountedCompleter {
      final Frame _frame;
      final Key<Frame> _destKey;
      final String[] _skipCols;
      final ToEigenVec _tev;
      CategoricalEigenEncoderDriver(ToEigenVec tev, Frame frame, Key<Frame> destKey, String[] skipCols) {
        _tev = tev; _frame = frame; _destKey = destKey; _skipCols = skipCols;
        assert _tev != null : "Override toEigenVec for this Algo!";
      }

      @Override public void compute2() {
        Vec[] frameVecs = _frame.vecs();
        Vec[] extraVecs = new Vec[_skipCols == null ? 0 : _skipCols.length];
        for (int i=0; i<extraVecs.length; ++i) {
          Vec v = _skipCols == null || _skipCols.length <= i ? null : _frame.vec(_skipCols[i]); //can be null
          if (v != null) extraVecs[i] = v;
        }
        Frame outputFrame = new Frame(_destKey);
        for (int i = 0; i < frameVecs.length; ++i) {
          if (_skipCols != null && ArrayUtils.find(_skipCols, _frame._names[i]) >= 0) continue;
          if (frameVecs[i].isCategorical())
            outputFrame.add(_frame.name(i) + ".Eigen", _tev.toEigenVec(frameVecs[i]));
          else
            outputFrame.add(_frame.name(i), frameVecs[i].makeCopy());
        }
        for (int i=0; i<extraVecs.length; ++i) {
          if (extraVecs[i] != null)
            outputFrame.add(_skipCols[i], extraVecs[i].makeCopy());
        }
        DKV.put(outputFrame);
        tryComplete();
      }
    }

    public Job<Frame> exec() {
      if (_frame == null)
        throw new IllegalArgumentException("Frame doesn't exist.");
      Key<Frame> destKey = Key.makeSystem(Key.make().toString());
      _job = new Job<>(destKey, Frame.class.getName(), "CategoricalEigenEncoder");
      int workAmount = _frame.lastVec().nChunks();
      return _job.start(new CategoricalEigenEncoderDriver(_tev, _frame, destKey, _skipCols), workAmount);
    }
  }

  static public void cleanUp(IcedHashMap<Key, String> toDelete) {
    Futures fs = new Futures();
    for (Key k : toDelete.keySet()) {
      k.remove(fs);
    }
    fs.blockForPending();
    toDelete.clear();
  }

  /**
   * Reduce the domains of all categorical columns to the actually observed subset.
   * @param frameToModifyInPlace frame whose categorical domains are shrunk in place
   */
  static public void shrinkDomainsToObservedSubset(Frame frameToModifyInPlace) {
    for (Vec v : frameToModifyInPlace.vecs()) {
      if (v.isCategorical()) {
        long[] uniques = (v.min() >= 0 && v.max() < Integer.MAX_VALUE - 4) ?
                new VecUtils.CollectDomainFast((int) v.max()).doAll(v).domain() :
                new VecUtils.CollectDomain().doAll(v).domain();
        String[] newDomain = new String[uniques.length];
        final int[] fromTo = new int[(int) ArrayUtils.maxValue(uniques) + 1];
        for (int i=0; i<newDomain.length; ++i) {
          newDomain[i] = v.domain()[(int) uniques[i]];
          fromTo[(int) uniques[i]] = i; //helper for value mapping
        }
        new MRTask() {
          @Override public void map(Chunk c) {
            for (int i=0; i<c._len; ++i) {
              if (c.isNA(i)) continue;
              else c.set(i, fromTo[(int) c.at8(i)]);
            }
          }
        }.doAll(v);
        v.setDomain(newDomain);
      }
    }
  }
}