package water.parser;

import java.util.concurrent.atomic.AtomicInteger;

import water.Iced;
import water.util.IcedHashMap;
import water.util.Log;
import water.util.PrettyPrint;

/**
 * Tracks the distinct string values (levels) seen in one categorical (factor) column.
 *
 * <p>The class is a thin wrapper around an {@link IcedHashMap} that maps each level to an
 * integer id handed out by an {@link AtomicInteger}.
 *
 * <p>Design notes carried over from the original author (the sorting step is not implemented
 * in this class and happens elsewhere):
 * <ul>
 *   <li>During the first parse pass the set of unique strings per column is collected.</li>
 *   <li>After the first pass the keys are sorted and indexed alphabetically.</li>
 *   <li>During the second pass the map is used for lookup only and is never updated.</li>
 *   <li>Categorical objects are shared among threads on the local node.</li>
 * </ul>
 *
 * @author tomasnykodym
 */
public final class Categorical extends Iced {

  /** Once the map grows beyond this many levels, {@link #isMapFull()} starts reporting true. */
  public static final int MAX_CATEGORICAL_COUNT = 10_000_000;

  /** Generator for the ids assigned by {@link #addKey(BufferedString)}. */
  transient AtomicInteger _id = new AtomicInteger();

  /** Explicit maximum id; while it is -1, {@link #maxId()} falls back to the id generator. */
  int _maxId = -1;

  /** Level -> id mapping. {@link #addKey(BufferedString)} tolerates this being null. */
  volatile IcedHashMap<BufferedString, Integer> _map;

  /** Set by {@link #addKey(BufferedString)} when the map outgrows MAX_CATEGORICAL_COUNT. */
  boolean maxDomainExceeded = false;

  /** Creates a tracker with an empty map. */
  Categorical() {
    _map = new IcedHashMap<>();
  }

  /**
   * Registers a level, using the map as a hash set of levels.
   *
   * @param str the level to register; a copy of it is stored, not the argument itself
   * @return the id already recorded for the level, or the newly assigned id, or
   *         {@code Integer.MAX_VALUE} when the map has been nulled out
   */
  int addKey(BufferedString str) {
    // _map is shared and may be set to null (categorical killed), so work on a local reference.
    final IcedHashMap<BufferedString, Integer> snapshot = _map;
    if (snapshot == null) {
      return Integer.MAX_VALUE; // Nuked already
    }

    final Integer known = snapshot.get(str);
    if (known != null) {
      return known; // Recorded already
    }

    // Length limit so 65535 can be used as a sentinel.
    assert str.length() < 65535;

    // The id is drawn before the insert; if another insert wins the race, this id stays unused.
    final int candidate = _id.incrementAndGet();
    final Integer winner = snapshot.putIfAbsent(new BufferedString(str), candidate);
    if (winner != null) {
      return winner;
    }

    if (snapshot.size() > MAX_CATEGORICAL_COUNT) {
      maxDomainExceeded = true;
    }
    return candidate;
  }

  /** Returns whether the given level is present in the map. */
  final boolean containsKey(BufferedString key) {
    return _map.containsKey(key);
  }

  /** Renders the underlying map wrapped in braces. */
  @Override
  public String toString() {
    return "{" + _map + " }";
  }

  /**
   * Looks up the id stored for a level.
   *
   * <p>The boxed map value is unboxed on return, so a lookup that yields null fails with a
   * {@link NullPointerException}.
   */
  int getTokenId(BufferedString str) {
    return _map.get(str);
  }

  /** Returns {@code _maxId} when it has been set, otherwise the generator's current value. */
  int maxId() {
    if (_maxId != -1) {
      return _maxId;
    }
    return _id.get();
  }

  /** Returns the number of entries currently in the map. */
  int size() {
    return _map.size();
  }

  /** Returns true once the map has been seen to exceed MAX_CATEGORICAL_COUNT entries. */
  boolean isMapFull() {
    return maxDomainExceeded;
  }

  /** Returns the map's current keys as an array, in the key set's iteration order. */
  BufferedString[] getColumnDomain() {
    return _map.keySet().toArray(new BufferedString[_map.size()]);
  }

  /** Maximum number of hex-converted levels quoted in the log message. */
  public static final int MAX_EXAMPLES = 10;

  // TODO(Vlad): either make sure it works, or just get rid of it
  /**
   * Re-keys every level whose {@code toString()} form is not reported as the same string by
   * {@code sameString}. Levels whose string form contains the U+FFFD replacement character
   * are re-keyed under their {@code bytesToString()} form instead, and up to MAX_EXAMPLES of
   * those are listed in an info log message.
   *
   * @param col column index, used only for the log message
   */
  public void convertToUTF8(int col) {
    final BufferedString[] keys = _map.keySet().toArray(new BufferedString[_map.size()]);
    final StringBuilder examples = new StringBuilder();
    int hexConverted = 0;

    for (int idx = 0; idx < keys.length; idx++) {
      final BufferedString current = keys[idx];
      String text = current.toString();
      if (current.sameString(text)) {
        continue; // nothing to convert for this level
      }

      if (text.contains("\uFFFD")) { // make weird chars into hex
        text = current.bytesToString();
        if (hexConverted < MAX_EXAMPLES) {
          examples.append(text).append(", ");
        }
        hexConverted++;
        if (hexConverted == MAX_EXAMPLES) {
          examples.append("...");
        }
      }

      // Move the id from the old key to the converted key.
      final int id = _map.get(current);
      _map.remove(current);
      final BufferedString replacement = new BufferedString(text);
      keys[idx] = replacement;
      _map.put(replacement, id);
    }

    if (hexConverted > 0) {
      Log.info("Found categoricals with non-UTF-8 characters in the "
          + PrettyPrint.withOrdinalIndicator(col)
          + " column. Converting unrecognized characters into hex: "
          + examples.toString());
    }
  }

  // Since this is a *concurrent* hashtable, writing it while it is being
  // updated is tricky.  If the table is NOT being updated, then all is written
  // as expected.  If the table IS being updated we only promise to write the
  // Keys that existed at the time the table write began.  If elements are
  // being deleted, they may be written anyway.  If the Values are changing, a
  // random Value is written.
  //  public AutoBuffer write_impl( AutoBuffer ab ) {
  //    if( _map == null ) return ab.put1(1); // Killed map marker
  //    ab.put1(0);                           // Not killed
  //    ab.put4(maxId());
  //    for( BufferedString key : _map.keySet() )
  //      ab.put2((char)key.length()).putA1(key.getBuffer(),key.length()).put4(_map.get(key));
  //    return ab.put2((char)65535);          // End of map marker
  //  }
  //
  //  public Categorical read_impl( AutoBuffer ab ) {
  //    assert _map == null || _map.size()==0;
  //    _map = null;
  //    if( ab.get1() == 1 ) return this;     // Killed?
  //    _maxId = ab.get4();
  //    _map = new NonBlockingHashMap<>();
  //    int len;
  //    while( (len = ab.get2()) != 65535 )   // Read until end-of-map marker
  //      _map.put(new BufferedString(ab.getA1(len)),ab.get4());
  //    return this;
  //  }
}