package water.fvec; import water.AutoBuffer; import water.Key; import water.DKV; import water.util.ArrayUtils; import java.util.Arrays; import java.util.HashMap; /** A vector transforming values of given vector according to given domain * mapping - currently only used to transform categorical columns but in theory would * work for any dense-packed Int column. Expected usage is to map from a new * dataset to the domain-mapping expected by a model (which will match the * dataset it was trained on). * * <p>The Vector's domain is the union of the Test and Train domains. * * <p>The mapping is defined by int[] array, size is input Test.domain.length. * Contents refer to values in the Train.domain. Extra values in the Test * domain are sorted after the Train.domain - so mapped values have to be * range-checked (note that returning some flag for NA, say -1, would also * need to be checked for). */ public class CategoricalWrappedVec extends WrappedVec { /** List of values from underlying vector which this vector map to a new * value in the union domain. */ int[] _map; int _p=0; /** Main constructor: convert from one categorical to another */ public CategoricalWrappedVec(Key key, int rowLayout, String[] toDomain, Key masterVecKey) { super(key, rowLayout, masterVecKey); computeMap(masterVec().domain(),toDomain,masterVec().isBad()); DKV.put(this); } /** Constructor just to generate the map and domain; used in tests or when * mixing categorical columns */ private CategoricalWrappedVec(Key key) { super(key, ESPC.rowLayout(key, new long[]{0}), null, null); } public static int[] computeMap(String[] from, String[] to) { Key key = Vec.newKey(); CategoricalWrappedVec tmp = new CategoricalWrappedVec(key); tmp.computeMap(from, to, false); return tmp._map; } @Override public Chunk chunkForChunkIdx(int cidx) { return new CategoricalWrappedChunk(masterVec().chunkForChunkIdx(cidx), this); } /** Compute a mapping from the 'from' domain to the 'to' domain. Strings in * the 'from' domain not in the 'to' domain are mapped past the end of the * 'to' values. Strings in the 'to' domain not in the 'from' domain * simply do not appear in the mapping. The returned map is always the same * length as the 'from' domain. Its contents have values from both * domains; the resulting domain is as big as the largest value in the map, * and only has strings from the 'from' domain (which probably overlap * somewhat with the 'to' domain). * * <p> Example: from={"Blue","Red","Green"}, to={"Green","Yellow","Blue"}.<br> * "Yellow" does not appear in the 'from' domain; "Red" does not appear in the 'to' domain.<br> * Returned map is {2,3,0}.<br> * Map length matches the 'from' domain length.<br> * Largest value is 3, so the domain is size 4.<br> * Domain is: {"Green","Yellow","Blue","Red"}<br> * Extra values in the 'from' domain appear, in-order in the 'from' domain, at the end. * @return mapping */ void computeMap( String[] from, String[] to, boolean fromIsBad ) { // Identity? Build the cheapo non-map if( from==to || Arrays.equals(from,to) ) { _map = ArrayUtils.seq(0,to.length); setDomain(to); return; } // The source Vec does not have a domain, hence is an integer column. The // to[] mapping has the set of unique numbers, we need to map from those // numbers to the index to the numbers. if( from==null ) { setDomain(to); if( fromIsBad ) { _map = new int[0]; return; } int min = Integer.valueOf(to[0]); int max = Integer.valueOf(to[to.length-1]); Vec mvec = masterVec(); if( !(mvec.isInt() && mvec.min() >= min && mvec.max() <= max) ) throw new NumberFormatException(); // Unable to figure out a valid mapping // FIXME this is a bit of a hack to allow adapTo calls to play nice with negative ints in the domain... if( Integer.valueOf(to[0]) < 0 ) { _p=Math.max(0,max); _map = new int[(_p /*positive array of values*/) + (-1*min /*negative array of values*/) + 1 /*one more to store "max" value*/]; for(int i=0;i<to.length;++i) { int v = Integer.valueOf(to[i]); if( v < 0 ) v = -1*v+_p; _map[v] = i; } return; } _map = new int[max+1]; for( int i=0; i<to.length; i++ ) _map[Integer.valueOf(to[i])] = i; return; } // The desired result Vec does not have a domain, hence is a numeric // column. For classification of numbers, we did an original toCategoricalVec // wrapping the numeric values up as Strings for the classes. Unwind that, // converting numeric strings back to their original numbers. _map = new int[from.length]; if( to == null ) { for( int i=0; i<from.length; i++ ) _map[i] = Integer.valueOf(from[i]); return; } // Full string-to-string mapping HashMap<String,Integer> h = new HashMap<>(); for( int i=0; i<to.length; i++ ) h.put(to[i],i); String[] ss = to; int extra = to.length; int actualLen = extra; for( int j=0; j<from.length; j++ ) { Integer x = h.get(from[j]); if( x!=null ) _map[j] = x; else { _map[j] = extra++; if (extra > ss.length) { ss = Arrays.copyOf(ss, 2*ss.length); } ss[extra-1] = from[j]; actualLen = extra; } } setDomain(Arrays.copyOf(ss, actualLen)); } @Override public Vec doCopy() { return new CategoricalWrappedVec(group().addVec(),_rowLayout, domain(), _masterVecKey); } public static class CategoricalWrappedChunk extends Chunk { public final transient Chunk _c; // Test-set map final transient int[] _map; final transient int _p; CategoricalWrappedChunk(Chunk c, CategoricalWrappedVec vec) { _c = c; set_len(_c._len); _start = _c._start; _vec = vec; _cidx = _c._cidx; _map = vec._map; _p = vec._p; } // Returns the mapped value. {@code _map} covers all the values in the // master Chunk, so no AIOOBE. Missing values in the master Chunk return // the usual NaN. @Override protected double atd_impl(int idx) { return _c.isNA_impl(idx) ? Double.NaN : at8_impl(idx); } // Returns the mapped value. {@code _map} covers all the values in the // master Chunk, so no AIOOBE. Missing values in the master Chunk throw // the normal missing-value exception when loading from the master. @Override protected long at8_impl(int idx) { int at8 = (int)_c.at8_impl(idx); if( at8 >= 0 ) return _map[at8]; else return _map[-1*at8+_p]; } // Returns true if the masterVec is missing, false otherwise @Override protected boolean isNA_impl(int idx) { return _c.isNA_impl(idx); } @Override boolean set_impl(int idx, long l) { return false; } @Override boolean set_impl(int idx, double d) { return false; } @Override boolean set_impl(int idx, float f) { return false; } @Override boolean setNA_impl(int idx) { return false; } @Override public ChunkVisitor processRows(ChunkVisitor nc, int from, int to){ for( int i=from; i< to; i++ ) if(isNA(i))nc.addNAs(1); else nc.addValue(at8(i)); return nc; } @Override public ChunkVisitor processRows(ChunkVisitor nc, int... rows){ for( int i:rows) if(isNA(i))nc.addNAs(1); else nc.addValue(at8(i)); return nc; } public static AutoBuffer write_impl(CategoricalWrappedVec v,AutoBuffer bb) { throw water.H2O.fail(); } @Override protected final void initFromBytes () { throw water.H2O.fail(); } @Override public boolean hasNA() { return false; } public Chunk deepCopy() { return extractRows(new NewChunk(this),0,_c._len).compress(); } } }