package water.util; import water.*; import water.exceptions.H2OIllegalArgumentException; import water.exceptions.H2OIllegalValueException; import water.fvec.C0DChunk; import water.fvec.Chunk; import water.fvec.NewChunk; import water.fvec.Vec; import water.nbhm.NonBlockingHashMapLong; import water.parser.BufferedString; import water.parser.Categorical; import java.util.*; public class VecUtils { /** * Create a new {@link Vec} of categorical values from an existing {@link Vec}. * * This method accepts all {@link Vec} types as input. The original Vec is not mutated. * * If src is a categorical {@link Vec}, a copy is returned. * * If src is a numeric {@link Vec}, the values are converted to strings used as domain * values. * * For all other types, an exception is currently thrown. These need to be replaced * with appropriate conversions. * * Throws H2OIllegalArgumentException() if the resulting domain exceeds * Categorical.MAX_CATEGORICAL_COUNT. * * @param src A {@link Vec} whose values will be used as the basis for a new categorical {@link Vec} * @return the resulting categorical Vec */ public static Vec toCategoricalVec(Vec src) { switch (src.get_type()) { case Vec.T_CAT: return src.makeCopy(src.domain()); case Vec.T_NUM: return numericToCategorical(src); case Vec.T_STR: // PUBDEV-2204 return stringToCategorical(src); case Vec.T_TIME: // PUBDEV-2205 throw new H2OIllegalArgumentException("Changing time/date columns to a categorical" + " column has not been implemented yet."); case Vec.T_UUID: throw new H2OIllegalArgumentException("Changing UUID columns to a categorical" + " column has not been implemented yet."); default: throw new H2OIllegalArgumentException("Unrecognized column type " + src.get_type_str() + " given to toCategoricalVec()"); } } /** * Create a new {@link Vec} of categorical values from string {@link Vec}. * * FIXME: implement in more efficient way with Brandon's primitives for BufferedString manipulation * * @param vec a string {@link Vec} * @return a categorical {@link Vec} */ public static Vec stringToCategorical(Vec vec) { final String[] vecDomain = new CollectStringVecDomain().domain(vec); MRTask task = new MRTask() { transient private java.util.HashMap<String, Integer> lookupTable; @Override protected void setupLocal() { lookupTable = new java.util.HashMap<>(vecDomain.length); for (int i = 0; i < vecDomain.length; i++) { // FIXME: boxing lookupTable.put(vecDomain[i], i); } } @Override public void map(Chunk c, NewChunk nc) { BufferedString bs = new BufferedString(); for (int row = 0; row < c.len(); row++) { if (c.isNA(row)) { nc.addNA(); } else { c.atStr(bs, row); nc.addNum(lookupTable.get(bs.bytesToString()), 0); } } } }; // Invoke tasks - one input vector, one ouput vector task.doAll(new byte[] {Vec.T_CAT}, vec); // Return result return task.outputFrame(null, null, new String[][] {vecDomain}).vec(0); } /** * Create a new {@link Vec} of categorical values from a numeric {@link Vec}. * * This currently only ingests a {@link Vec} of integers. * * Handling reals is PUBDEV-2207 * * @param src a numeric {@link Vec} * @return a categorical {@link Vec} */ public static Vec numericToCategorical(Vec src) { if (src.isInt()) { int min = (int) src.min(), max = (int) src.max(); // try to do the fast domain collection long dom[] = (min >= 0 && max < Integer.MAX_VALUE - 4) ? new CollectDomainFast(max).doAll(src).domain() : new CollectDomain().doAll(src).domain(); if (dom.length > Categorical.MAX_CATEGORICAL_COUNT) throw new H2OIllegalArgumentException("Column domain is too large to be represented as an categorical: " + dom.length + " > " + Categorical.MAX_CATEGORICAL_COUNT); return copyOver(src, Vec.T_CAT, dom); } else throw new H2OIllegalArgumentException("Categorical conversion can only currently be applied to integer columns."); } /** * Create a new {@link Vec} of numeric values from an existing {@link Vec}. * * This method accepts all {@link Vec} types as input. The original Vec is not mutated. * * If src is a categorical {@link Vec}, a copy is returned. * * If src is a string {@link Vec}, all values that can be are parsed into reals or integers, and all * others become NA. See stringToNumeric for parsing details. * * If src is a numeric {@link Vec}, a copy is made. * * If src is a time {@link Vec}, the milliseconds since the epoch are used to populate the new Vec. * * If src is a UUID {@link Vec}, the existing numeric storage is used to populate the new Vec. * * Throws H2OIllegalArgumentException() if the resulting domain exceeds * Categorical.MAX_CATEGORICAL_COUNT. * * @param src A {@link Vec} whose values will be used as the basis for a new numeric {@link Vec} * @return the resulting numeric {@link Vec} */ public static Vec toNumericVec(Vec src) { switch (src.get_type()) { case Vec.T_CAT: return categoricalToInt(src); case Vec.T_STR: return stringToNumeric(src); case Vec.T_NUM: case Vec.T_TIME: case Vec.T_UUID: return src.makeCopy(null, Vec.T_NUM); default: throw new H2OIllegalArgumentException("Unrecognized column type " + src.get_type_str() + " given to toNumericVec()"); } } /** * Create a new {@link Vec} of numeric values from a string {@link Vec}. Any rows that cannot be * converted to a number are set to NA. * * Currently only does basic numeric formats. No exponents, or hex values. Doesn't * even like commas or spaces. :( Needs love. Handling more numeric * representations is PUBDEV-2209 * * @param src a string {@link Vec} * @return a numeric {@link Vec} */ public static Vec stringToNumeric(Vec src) { if(!src.isString()) throw new H2OIllegalArgumentException("stringToNumeric conversion only works on string columns"); Vec res = new MRTask() { @Override public void map(Chunk chk, NewChunk newChk){ if (chk instanceof C0DChunk) { // all NAs for (int i=0; i < chk._len; i++) newChk.addNA(); } else { BufferedString tmpStr = new BufferedString(); for (int i=0; i < chk._len; i++) { if (!chk.isNA(i)) { tmpStr = chk.atStr(tmpStr, i); switch (tmpStr.getNumericType()) { case BufferedString.NA: newChk.addNA(); break; case BufferedString.INT: newChk.addNum(Long.parseLong(tmpStr.toString()),0); break; case BufferedString.REAL: newChk.addNum(Double.parseDouble(tmpStr.toString())); break; default: throw new H2OIllegalValueException("Received unexpected type when parsing a string to a number.", this); } } else newChk.addNA(); } } } }.doAll(Vec.T_NUM, src).outputFrame().anyVec(); assert res != null; return res; } /** * Create a new {@link Vec} of numeric values from a categorical {@link Vec}. * * If the first value in the domain of the src Vec is a stringified ints, * then it will use those ints. Otherwise, it will use the raw enumeration level mapping. * If the domain is stringified ints, then all of the domain must be able to be parsed as * an int. If it cannot be parsed as such, a NumberFormatException will be caught and * rethrown as an H2OIllegalArgumentException that declares the illegal domain value. * Otherwise, the this pointer is copied to a new Vec whose domain is null. * * The magic of this method should be eliminated. It should just use enumeration level * maps. If the user wants domains to be used, call categoricalDomainsToNumeric(). * PUBDEV-2209 * * @param src a categorical {@link Vec} * @return a numeric {@link Vec} */ public static Vec categoricalToInt(final Vec src) { if( src.isInt() && (src.domain()==null || src.domain().length == 0)) return copyOver(src, Vec.T_NUM, null); if( !src.isCategorical() ) throw new IllegalArgumentException("categoricalToInt conversion only works on categorical columns."); // check if the 1st lvl of the domain can be parsed as int boolean useDomain=false; Vec newVec = copyOver(src, Vec.T_NUM, null); try { Integer.parseInt(src.domain()[0]); useDomain=true; } catch (NumberFormatException e) { // makeCopy and return... } if( useDomain ) { new MRTask() { @Override public void map(Chunk c) { for (int i=0;i<c._len;++i) if( !c.isNA(i) ) c.set(i, Integer.parseInt(src.domain()[(int)c.at8(i)])); } }.doAll(newVec); } return newVec; } /** * Create a new {@link Vec} of string values from an existing {@link Vec}. * * This method accepts all {@link Vec} types as input. The original Vec is not mutated. * * If src is a string {@link Vec}, a copy of the {@link Vec} is made. * * If src is a categorical {@link Vec}, levels are dropped, and the {@link Vec} only records the string. * * For all numeric {@link Vec}s, the number is converted to a string. * * For all UUID {@link Vec}s, the hex representation is stored as a string. * * @param src A {@link Vec} whose values will be used as the basis for a new string {@link Vec} * @return the resulting string {@link Vec} */ public static Vec toStringVec(Vec src) { switch (src.get_type()) { case Vec.T_STR: return src.makeCopy(); case Vec.T_CAT: return categoricalToStringVec(src); case Vec.T_UUID: return UUIDToStringVec(src); case Vec.T_TIME: case Vec.T_NUM: return numericToStringVec(src); default: throw new H2OIllegalArgumentException("Unrecognized column type " + src.get_type_str() + " given to toStringVec()."); } } /** * Create a new {@link Vec} of string values from a categorical {@link Vec}. * * Transformation is done by a {@link Categorical2StrChkTask} which provides a mapping * between values - without copying the underlying data. * * @param src a categorical {@link Vec} * @return a string {@link Vec} */ public static Vec categoricalToStringVec(Vec src) { if( !src.isCategorical() ) throw new H2OIllegalValueException("Can not convert a non-categorical column" + " using categoricalToStringVec().",src); return new Categorical2StrChkTask(src.domain()).doAll(Vec.T_STR,src).outputFrame().anyVec(); } private static class Categorical2StrChkTask extends MRTask<Categorical2StrChkTask> { final String[] _domain; Categorical2StrChkTask(String[] domain) { _domain=domain; } @Override public void map(Chunk c, NewChunk nc) { for(int i=0;i<c._len;++i) if (!c.isNA(i)) nc.addStr(_domain == null ? "" + c.at8(i) : _domain[(int) c.at8(i)]); else nc.addNA(); } } /** * Create a new {@link Vec} of string values from a numeric {@link Vec}. * * Currently only uses a default pretty printer. Would be better if * it accepted a format string PUBDEV-2211 * * @param src a numeric {@link Vec} * @return a string {@link Vec} */ public static Vec numericToStringVec(Vec src) { if (src.isCategorical() || src.isUUID()) throw new H2OIllegalValueException("Cannot convert a non-numeric column" + " using numericToStringVec() ",src); Vec res = new MRTask() { @Override public void map(Chunk chk, NewChunk newChk) { if (chk instanceof C0DChunk) { // all NAs for (int i=0; i < chk._len; i++) newChk.addNA(); } else { for (int i=0; i < chk._len; i++) { if (!chk.isNA(i)) newChk.addStr(PrettyPrint.number(chk, chk.atd(i), 4)); else newChk.addNA(); } } } }.doAll(Vec.T_STR, src).outputFrame().anyVec(); assert res != null; return res; } /** * Create a new {@link Vec} of string values from a UUID {@link Vec}. * * String {@link Vec} is the standard hexadecimal representations of a UUID. * * @param src a UUID {@link Vec} * @return a string {@link Vec} */ public static Vec UUIDToStringVec(Vec src) { if( !src.isUUID() ) throw new H2OIllegalArgumentException("UUIDToStringVec() conversion only works on UUID columns"); Vec res = new MRTask() { @Override public void map(Chunk chk, NewChunk newChk) { if (chk instanceof C0DChunk) { // all NAs for (int i=0; i < chk._len; i++) newChk.addNA(); } else { for (int i=0; i < chk._len; i++) { if (!chk.isNA(i)) newChk.addStr(PrettyPrint.UUID(chk.at16l(i), chk.at16h(i))); else newChk.addNA(); } } } }.doAll(Vec.T_STR,src).outputFrame().anyVec(); assert res != null; return res; } /** * Create a new {@link Vec} of numeric values from a categorical {@link Vec}. * * Numeric values are generated explicitly from the domain values, and not the * enumeration levels. If a domain value cannot be translated as a number, that * domain and all values for that domain will be NA. * * @param src a categorical {@link Vec} * @return a numeric {@link Vec} */ public static Vec categoricalDomainsToNumeric(final Vec src) { if( !src.isCategorical() ) throw new H2OIllegalArgumentException("categoricalToNumeric() conversion only works on categorical columns"); // check if the 1st lvl of the domain can be parsed as int return new MRTask() { @Override public void map(Chunk c) { for (int i=0;i<c._len;++i) if( !c.isNA(i) ) c.set(i, Integer.parseInt(src.domain()[(int)c.at8(i)])); } }.doAll(Vec.T_NUM, src).outputFrame().anyVec(); } /** Collect numeric domain of given {@link Vec} * A map-reduce task to collect up the unique values of an integer {@link Vec} * and returned as the domain for the {@link Vec}. * */ public static class CollectDomain extends MRTask<CollectDomain> { transient NonBlockingHashMapLong<String> _uniques; @Override protected void setupLocal() { _uniques = new NonBlockingHashMapLong<>(); } @Override public void map(Chunk ys) { for( int row=0; row< ys._len; row++ ) if( !ys.isNA(row) ) _uniques.put(ys.at8(row), ""); } @Override public void reduce(CollectDomain mrt) { if( _uniques != mrt._uniques ) _uniques.putAll(mrt._uniques); } public final AutoBuffer write_impl( AutoBuffer ab ) { return ab.putA8(_uniques==null ? null : _uniques.keySetLong()); } public final CollectDomain read_impl( AutoBuffer ab ) { long ls[] = ab.getA8(); assert _uniques == null || _uniques.size()==0; // Only receiving into an empty (shared) NBHM _uniques = new NonBlockingHashMapLong<>(); if( ls != null ) for( long l : ls ) _uniques.put(l, ""); return this; } @Override public final void copyOver(CollectDomain that) { _uniques = that._uniques; } /** Returns exact numeric domain of given {@link Vec} computed by this task. * The domain is always sorted. Hence: * domain()[0] - minimal domain value * domain()[domain().length-1] - maximal domain value */ public long[] domain() { long[] dom = _uniques.keySetLong(); Arrays.sort(dom); return dom; } } /** * Create a new categorical {@link Vec} with deduplicated domains from a categorical {@link Vec}. * * Categoricals may have the same values after munging, and should have the same domain index in the numerical chunk * representation. Unify categoricals that are the same by remapping their domain indices. * * Could be more efficient with a vec copy and replace domain indices as needed. PUBDEV-2587 */ public static class DomainDedupe extends MRTask<DomainDedupe> { private final HashMap<Integer, Integer> _oldToNewDomainIndex; public DomainDedupe(HashMap<Integer, Integer> oldToNewDomainIndex) {_oldToNewDomainIndex = oldToNewDomainIndex; } @Override public void map(Chunk c, NewChunk nc) { for( int row=0; row < c._len; row++) { if ( !c.isNA(row) ) { int oldDomain = (int) c.at8(row); nc.addNum(_oldToNewDomainIndex.get(oldDomain)); } else { nc.addNA(); } } } public static Vec domainDeduper(Vec vec, HashMap<String, ArrayList<Integer>> substringToOldDomainIndices) { HashMap<Integer, Integer> oldToNewDomainIndex = new HashMap<>(); int newDomainIndex = 0; SortedSet<String> alphabetizedSubstrings = new TreeSet<>(substringToOldDomainIndices.keySet()); for (String sub : alphabetizedSubstrings) { for (int oldDomainIndex : substringToOldDomainIndices.get(sub)) { oldToNewDomainIndex.put(oldDomainIndex, newDomainIndex); } newDomainIndex++; } VecUtils.DomainDedupe domainDedupe = new VecUtils.DomainDedupe(oldToNewDomainIndex); String[][] dom2D = {Arrays.copyOf(alphabetizedSubstrings.toArray(), alphabetizedSubstrings.size(), String[].class)}; return domainDedupe.doAll(new byte[]{Vec.T_CAT}, vec).outputFrame(null, null, dom2D).anyVec(); } } // >11x faster than CollectDomain /** (Optimized for positive ints) Collect numeric domain of given {@link Vec} * A map-reduce task to collect up the unique values of an integer {@link Vec} * and returned as the domain for the {@link Vec}. * */ public static class CollectDomainFast extends MRTask<CollectDomainFast> { private final int _s; private boolean[] _u; private long[] _d; public CollectDomainFast(int s) { _s=s; } @Override protected void setupLocal() { _u= MemoryManager.mallocZ(_s + 1); } @Override public void map(Chunk ys) { for( int row=0; row< ys._len; row++ ) if( !ys.isNA(row) ) _u[(int)ys.at8(row)]=true; } @Override public void reduce(CollectDomainFast mrt) { if( _u != mrt._u ) ArrayUtils.or(_u, mrt._u);} @Override protected void postGlobal() { int c=0; for (boolean b : _u) if(b) c++; _d=MemoryManager.malloc8(c); int id=0; for (int i = 0; i < _u.length;++i) if (_u[i]) _d[id++]=i; Arrays.sort(_d); //is this necessary? } /** Returns exact numeric domain of given {@link Vec} computed by this task. * The domain is always sorted. Hence: * domain()[0] - minimal domain value * domain()[domain().length-1] - maximal domain value */ public long[] domain() { return _d; } } public static void deleteVecs(Vec[] vs, int cnt) { Futures f = new Futures(); for (int i =0; i < cnt; i++) vs[cnt].remove(f); f.blockForPending(); } private static Vec copyOver(Vec src, byte type, long[] domain) { String[][] dom = new String[1][]; dom[0]=domain==null?null:ArrayUtils.toString(domain); return new CPTask(domain).doAll(type, src).outputFrame(null,dom).anyVec(); } private static class CPTask extends MRTask<CPTask> { private final long[] _domain; CPTask(long[] domain) { _domain = domain;} @Override public void map(Chunk c, NewChunk nc) { for(int i=0;i<c._len;++i) { if( c.isNA(i) ) { nc.addNA(); continue; } if( _domain == null ) nc.addNum(c.at8(i)); else { long num = Arrays.binarySearch(_domain,c.at8(i)); // ~24 hits in worst case for 10M levels if( num < 0 ) throw new IllegalArgumentException("Could not find the categorical value!"); nc.addNum(num); } } } } private static class CollectStringVecDomain extends MRTask<CollectStringVecDomain> { private static String PLACEHOLDER = "nothing"; private IcedHashMap<String, IcedInt> _uniques = null; private final IcedInt _placeHolder = new IcedInt(1); @Override protected void setupLocal() { _uniques = new IcedHashMap<>(); } @Override public void map(Chunk c) { BufferedString bs = new BufferedString(); for (int i = 0; i < c.len(); i++) { if (!c.isNA(i)) { c.atStr(bs, i); _uniques.put(bs.bytesToString(), _placeHolder); } } } @Override public void reduce(CollectStringVecDomain mrt) { if (_uniques != mrt._uniques) { // this is not local reduce _uniques.putAll(mrt._uniques); } } public String[] domain(Vec vec) { assert vec.isString() : "String vector expected. Unsupported vector type: " + vec.get_type_str(); this.doAll(vec); return domain(); } public String[] domain() { String[] dom = _uniques.keySet().toArray(new String[_uniques.size()]); Arrays.sort(dom); return dom; } } public static int [] getLocalChunkIds(Vec v){ if(v._cids != null) return v._cids; int [] res = new int[Math.max(v.nChunks()/H2O.CLOUD.size(),1)]; int j = 0; for(int i = 0; i < v.nChunks(); ++i){ if(v.isHomedLocally(i)) { if(res.length == j) res = Arrays.copyOf(res,2*res.length); res[j++] = i; } } return (v._cids = j == res.length?res:Arrays.copyOf(res,j)); } /** * Compute the mean (weighted) response per categorical level * Skip NA values (those are already a separate bucket in the tree building histograms, for which this is designed) */ public static class MeanResponsePerLevelTask extends MRTask<MeanResponsePerLevelTask> { // OUTPUT public double[] meanWeightedResponse; public double meanOverallWeightedResponse; // Internal private double[] wcounts; private int _len; public MeanResponsePerLevelTask(int len) { _len = len; } @Override public void map(Chunk c, Chunk w, Chunk r) { wcounts = new double[_len]; // no larger than 1M elements, so OK to replicate per thread (faster) meanWeightedResponse = new double[_len]; for (int i=0; i<c._len; ++i) { if (c.isNA(i)) continue; int level = (int)c.at8(i); if (w.isNA(i)) continue; double weight = w.atd(i); if (weight == 0) continue; if (r.isNA(i)) continue; double response = r.atd(i); wcounts[level] += weight; meanWeightedResponse[level] += weight*response; } } @Override public void reduce(MeanResponsePerLevelTask mrt) { ArrayUtils.add(wcounts, mrt.wcounts); ArrayUtils.add(meanWeightedResponse, mrt.meanWeightedResponse); mrt.wcounts = null; mrt.meanWeightedResponse = null; } @Override protected void postGlobal() { meanOverallWeightedResponse = 0; double sum = 0; for (int i = 0; i< meanWeightedResponse.length; ++i) { if (wcounts[i] != 0) { meanWeightedResponse[i] = meanWeightedResponse[i] / wcounts[i]; meanOverallWeightedResponse += meanWeightedResponse[i]; sum += wcounts[i]; } } meanOverallWeightedResponse /= sum; } } /** * Reorder an integer (such as Enum storage) Vec using an int -> int mapping */ public static class ReorderTask extends MRTask<ReorderTask> { private int[] _map; public ReorderTask(int[] mapping) { _map = mapping; } @Override public void map(Chunk c, NewChunk nc) { for (int i=0;i<c._len;++i) { if (c.isNA(i)) nc.addNA(); else nc.addNum(_map[(int)c.at8(i)], 0); } } } }