package water.fvec;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.zip.*;
import jsr166y.CountedCompleter;
import jsr166y.ForkJoinTask;
import jsr166y.ForkJoinWorkerThread;
import jsr166y.RecursiveAction;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.Vec.VectorGroup;
import water.nbhm.NonBlockingHashMap;
import water.nbhm.NonBlockingSetInt;
import water.parser.*;
import water.parser.CustomParser.DataOut;
import water.parser.CustomParser.ParserSetup;
import water.parser.CustomParser.ParserType;
import water.parser.CustomParser.StreamDataOut;
import water.parser.Enum;
import water.util.FrameUtils;
import water.util.Log;
import water.util.Utils.IcedHashMap;
import water.util.Utils.IcedInt;
import water.util.Utils;

public final class ParseDataset2 extends Job {
  public final Key _progress;       // Job progress Key
  private MultiFileParseTask _mfpt; // Access to partially built vectors for cleanup after parser crash

  public static enum Compression { NONE, ZIP, GZIP }

  public static Key[] filterEmptyFiles(Key[] keys) {
    Arrays.sort(keys);
    // first check if there are any empty files and if so remove them
    Vec[] vecs = new Vec[keys.length];
    int c = 0;
    for( int i = 0; i < vecs.length; ++i ) {
      vecs[i] = getVec(keys[i]);
      if( vecs[i].length() == 0 ) c++;
    }
    if( c > 0 ) {               // filter out empty files
      Key[] ks = new Key[keys.length-c];
      Vec[] vs = new Vec[vecs.length-c];
      int j = 0;
      for( int i = 0; i < keys.length; ++i )
        if( vecs[i].length() != 0 ) { ks[j] = keys[i]; vs[j] = vecs[i]; ++j; }
      keys = ks;
    }
    return keys;
  }

  // --------------------------------------------------------------------------
  // Parse an array of CSV input/file keys into an array of distributed output Vecs
  public static Frame parse(Key okey, Key[] keys) {
    return parse(okey,keys,new GuessSetup.GuessSetupTsk(new ParserSetup(),true).invoke(keys)._gSetup._setup,true);
  }
  public static Frame parse(Key okey, Key[] keys, CustomParser.ParserSetup globalSetup, boolean delete_on_done) {
    if( globalSetup._ncols == 0 )
      throw new java.lang.IllegalArgumentException(globalSetup.toString());
    return forkParseDataset(okey, keys, globalSetup, delete_on_done).get();
  }

  // Same parse, as a backgroundable Job
  public static ParseDataset2 forkParseDataset(final Key dest, Key[] keys, final CustomParser.ParserSetup setup, boolean delete_on_done) {
    keys = filterEmptyFiles(keys);
    setup.checkDupColumnNames();
    // Some quick sanity checks: no overwriting your input key, and a resource check.
    long sum=0;
    for( Key k : keys ) {
      if( dest.equals(k) )
        throw new IllegalArgumentException("Destination key "+dest+" must be different from all sources");
      sum += DKV.get(k).length();   // Sum of all input filesizes
    }
    long memsz=0;                   // Cluster memory
    for( H2ONode h2o : H2O.CLOUD._memary )
      memsz += h2o.get_max_mem();
    if( sum > memsz*4 )
      throw new IllegalArgumentException("Total input file size of "+PrettyPrint.bytes(sum)+" is much larger than total cluster memory of "+PrettyPrint.bytes(memsz)+", please use either a larger cluster or smaller data.");

    ParseDataset2 job = new ParseDataset2(dest, keys);
    new Frame(job.dest(),new String[0],new Vec[0]).delete_and_lock(job.self()); // Lock BEFORE returning
    for( Key k : keys ) Lockable.read_lock(k,job.self());                       // Lock BEFORE returning
    ParserFJTask fjt = new ParserFJTask(job, keys, setup, delete_on_done);      // Fire off background parse
    // Make a wrapper task that only *starts* when the ParserFJTask fjt
    // completes - in particular, it still starts when fjt completes
    // exceptionally... thus the fjt onExceptionalCompletion code runs
    // completely before this empty task starts, providing a simple barrier.
    // Threads blocking on the job will block on the "cleanup" task, which will
    // block until the fjt runs the onCompletion or onExceptionalCompletion code.
    H2OCountedCompleter cleanup = new H2OCountedCompleter() {
      @Override public void compute2() { }
      @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) { return true; }
    };
    fjt.setCompleter(cleanup);
    job.start(cleanup);
    H2O.submitTask(fjt);
    return job;
  }

  // Setup a private background parse job
  private ParseDataset2(Key dest, Key[] fkeys) {
    destination_key = dest;
    // Job progress Key
    _progress = Key.make((byte) 0, Key.JOB);
    UKV.put(_progress, ParseProgress.make(fkeys));
  }

  // Simple internal class doing background parsing, with trackable Job status
  public static class ParserFJTask extends H2OCountedCompleter {
    final ParseDataset2 _job;
    Key[] _keys;
    CustomParser.ParserSetup _setup;
    boolean _delete_on_done;

    public ParserFJTask( ParseDataset2 job, Key[] keys, CustomParser.ParserSetup setup, boolean delete_on_done) {
      _job = job;
      _keys = keys;
      _setup = setup;
      _delete_on_done = delete_on_done;
    }
    @Override public void compute2() {
      parse_impl(_job, _keys, _setup, _delete_on_done);
      tryComplete();
    }

    // Took a crash/NPE somewhere in the parser.  Attempt cleanup.
    @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller){
      Futures fs = new Futures();
      if( _job != null ) {
        UKV.remove(_job.destination_key,fs);
        UKV.remove(_job._progress,fs);
        // Find & remove all partially-built output vecs & chunks
        if( _job._mfpt != null ) _job._mfpt.onExceptionCleanup(fs);
      }
      // Assume the input is corrupt - or already partially deleted after
      // parsing.  Nuke it all - no partial Vecs lying around.
      for( Key k : _keys ) UKV.remove(k,fs);
      fs.blockForPending();
      // As soon as the job is canceled, threads blocking on the job will
      // wake up.  Better have all cleanup done first!
      if( _job != null ) _job.cancel(ex);
      return true;
    }
  }

  // --------------------------------------------------------------------------
  // Parser progress
  static class ParseProgress extends Iced {
    final long _total;
    long _value;
    DException _ex;
    ParseProgress(long val, long total){_value = val; _total = total;}
    // Total number of steps is equal to total bytecount across files
    static ParseProgress make( Key[] fkeys ) {
      long total = 0;
      for( Key fkey : fkeys ) total += getVec(fkey).length();
      return new ParseProgress(0,total);
    }
    public void setException(DException ex){_ex = ex;}
    public DException getException(){return _ex;}
  }

  static void onProgress(final long len, final Key progress) {
    new TAtomic<ParseProgress>() {
      @Override public ParseProgress atomic(ParseProgress old) {
        if (old == null) return null;
        old._value += len;
        return old;
      }
    }.fork(progress);
  }

  @Override public float progress() {
    ParseProgress progress = UKV.get(_progress);
    if( progress == null || progress._total == 0 ) return 0;
    return progress._value / (float) progress._total;
  }
  @Override public void remove() {
    DKV.remove(_progress);
    super.remove();
  }

  /** Task to update enum values to match the global numbering scheme.
   *  Performs update in place so that values originally numbered using
   *  node-local unordered numbering will be numbered using global numbering.
   *  @author tomasnykodym
   */
  private static class EnumUpdateTask extends MRTask2<EnumUpdateTask> {
    private transient int[][][] _emap;
    final Key _eKey;
    private final ValueString [][] _gDomain;
    private final Enum [][] _lEnums;
    private final int [] _chunk2Enum;
    private final int [] _colIds;

    private EnumUpdateTask(ValueString [][] gDomain, Enum [][] lEnums, int [] chunk2Enum, Key lDomKey, int [] colIds){
      _gDomain = gDomain;
      _lEnums = lEnums;
      _chunk2Enum = chunk2Enum;
      _eKey = lDomKey;
      _colIds = colIds;
    }

    private int[][] emap(int nodeId) {
      if( _emap == null ) _emap = new int[_lEnums.length][][];
      if( _emap[nodeId] == null ) {
        int[][] emap = new int[_gDomain.length][];
        for( int i = 0; i < _gDomain.length; ++i ) {
          if( _gDomain[i] != null ) {
            assert _lEnums[nodeId] != null : "missing lEnum of node " + nodeId + ", enums = " + Arrays.toString(_lEnums);
            final Enum e = _lEnums[nodeId][_colIds[i]];
            emap[i] = new int[e.maxId()+1];
            Arrays.fill(emap[i], -1);
            for(int j = 0; j < _gDomain[i].length; ++j) {
              ValueString vs = _gDomain[i][j];
              if( e.containsKey(vs) ) {
                assert e.getTokenId(vs) <= e.maxId() : "maxIdx = " + e.maxId() + ", got " + e.getTokenId(vs);
                emap[i][e.getTokenId(vs)] = j;
              }
            }
          }
        }
        _emap[nodeId] = emap;
      }
      return _emap[nodeId];
    }

    @Override public void map(Chunk [] chks){
      int[][] emap = emap(_chunk2Enum[chks[0].cidx()]);
      final int cidx = chks[0].cidx();
      for(int i = 0; i < chks.length; ++i) {
        Chunk chk = chks[i];
        if(_gDomain[i] == null) // killed, replace with all NAs
          DKV.put(chk._vec.chunkKey(chk.cidx()),new C0DChunk(Double.NaN,chk._len));
        else for( int j = 0; j < chk._len; ++j){
          if( chk.isNA0(j) ) continue;
          long l = chk.at80(j);
          if (l < 0 || l >= emap[i].length)
            reportBrokenEnum(chk, i, j, l, emap);
          if(emap[i][(int)l] < 0)
            throw new RuntimeException(H2O.SELF.toString() + ": missing enum at col:" + i + ", line: " + j + ", val = " + l + ", chunk=" + chk.getClass().getSimpleName());
          chk.set0(j, emap[i][(int)l]);
        }
        chk.close(cidx, _fs);
      }
    }

    private void reportBrokenEnum( Chunk chk, int i, int j, long l, int[][] emap ) {
      Chunk chk2 = chk._chk2;
      chk._chk2 = null;
      StringBuilder sb = new StringBuilder("Enum renumber task, column # " + i + ": Found OOB index " + l + " (expected 0 - " + emap[i].length + ", global domain has " + _gDomain[i].length + " levels) pulled from " + chk.getClass().getSimpleName() + "\n");
      int k = 0;
      for(; k < Math.min(5,chk._len); ++k)
        sb.append("at8[" + (k+chk._start) + "] = " + chk.at80(k) + ", chk2 = " + (chk2 != null?chk2.at80(k):"") + "\n");
      k = Math.max(k,j-2);
      sb.append("...\n");
      for(; k < Math.min(chk._len,j+2); ++k)
        sb.append("at8[" + (k+chk._start) + "] = " + chk.at80(k) + ", chk2 = " + (chk2 != null?chk2.at80(k):"") + "\n");
      sb.append("...\n");
      k = Math.max(k,chk._len-5);
      for(; k < chk._len; ++k)
        sb.append("at8[" + (k+chk._start) + "] = " + chk.at80(k) + ", chk2 = " + (chk2 != null?chk2.at80(k):"") + "\n");
      throw new RuntimeException(sb.toString());
    }
  }

  // --------------------------------------------------------------------------
  private static class EnumFetchTask extends MRTask<EnumFetchTask> {
    private final Key _k;
    private final int[] _ecols;
    private final int _homeNode; // Node where the computation started; the enum from this node MUST be cloned!
    private Enum[] _gEnums;      // Global enums per column
    private Enum[][] _lEnums;    // Local enums per node per column

    private EnumFetchTask(int homeNode, Key k, int[] ecols){_homeNode = homeNode; _k = k; _ecols = ecols;}

    @Override public void map(Key key) {
      _lEnums = new Enum[H2O.CLOUD.size()][];
      if(MultiFileParseTask._enums.containsKey(_k)){
        _lEnums[H2O.SELF.index()] = _gEnums = MultiFileParseTask._enums.get(_k);
        // If we are the original node (i.e. there will be no sending over the
        // wire), we have to clone the enums so as not to share the same object
        // (sharing causes problems when computing column domains and renumbering maps).
        if( H2O.SELF.index() == _homeNode ) {
          _gEnums = _gEnums.clone();
          for(int i = 0; i < _gEnums.length; ++i)
            _gEnums[i] = _gEnums[i].clone();
        }
        MultiFileParseTask._enums.remove(_k);
      }
    }
    @Override public void reduce(EnumFetchTask etk) {
      if(_gEnums == null) {
        _gEnums = etk._gEnums;
        _lEnums = etk._lEnums;
      } else if (etk._gEnums != null) {
        for( int i : _ecols ) _gEnums[i].merge(etk._gEnums[i]);
        for( int i = 0; i < _lEnums.length; ++i )
          if( _lEnums[i] == null ) _lEnums[i] = etk._lEnums[i];
          else assert etk._lEnums[i] == null;
      }
    }
  }

  // --------------------------------------------------------------------------
  // Run once on all nodes; fill in missing zero chunks
  private static class SVFTask extends MRTask<SVFTask> {
    private final Frame _f;
    private SVFTask( Frame f ) { _f = f; }

    @Override public void map(Key key) {
      Vec v0 = _f.anyVec();
      ArrayList<RecursiveAction> rs = new ArrayList<RecursiveAction>();
      for( int i = 0; i < v0.nChunks(); ++i ) {
        if( !v0.chunkKey(i).home() ) continue;
        final int fi = i;
        rs.add(new RecursiveAction() {
          @Override protected void compute() {
            // First find the nrows as the # rows of non-missing chunks; done on
            // locally-homed chunks only - to keep the data distribution.
            int nlines = 0;
            for( Vec vec : _f.vecs() ) {
              Value val = H2O.get(vec.chunkKey(fi)); // Local-get only
              if( val != null ) {
                nlines = ((Chunk)val.get())._len;
                break;
              }
            }
            final int fnlines = nlines;
            // Now fill in appropriate-sized zero chunks
            for(int j = 0; j < _f.numCols(); ++j) {
              Vec vec = _f.vec(j);
              Key k = vec.chunkKey(fi);
              Value val = H2O.get(k);   // Local-get only
              if( val == null )         // Missing?  Fill in w/zero chunk
                H2O.putIfMatch(k, new Value(k, new C0DChunk(0, fnlines)), null);
            }
          }
        });
      }
      ForkJoinTask.invokeAll(rs);
    }
    @Override public void reduce( SVFTask drt ) {}
  }

  private static Vec getVec(Key key) {
    Object o = UKV.get(key);
    return o instanceof Vec ? (ByteVec) o : ((Frame) o).vecs()[0];
  }
  private static String [] genericColumnNames(int ncols){
    String [] res = new String[ncols];
    for(int i = 0; i < res.length; ++i) res[i] = "C" + String.valueOf(i+1);
    return res;
  }

  // Log information about the dataset we just parsed.
  private static void logParseResults(ParseDataset2 job, Frame fr) {
    try {
      long numRows = fr.anyVec().length();
      Log.info("Parse result for " + job.dest() + " (" + Long.toString(numRows) + " rows):");
      Vec[] vecArr = fr.vecs();
      for( int i = 0; i < vecArr.length; i++ ) {
        Vec v = vecArr[i];
        boolean isCategorical = v.isEnum();
        boolean isConstant = (v.min() == v.max());
        String CStr = String.format("C%d:", i+1);
        String typeStr = String.format("%s", (v._isUUID ? "UUID" : (isCategorical ? "categorical" : "numeric")));
        String minStr = String.format("min(%f)", v.min());
        String maxStr = String.format("max(%f)", v.max());
        long numNAs = v.naCnt();
        String naStr = (numNAs > 0) ? String.format("na(%d)", numNAs) : "";
        String isConstantStr = isConstant ? "constant" : "";
        String numLevelsStr = isCategorical ? String.format("numLevels(%d)", v.domain().length) : "";

        boolean printLogSeparatorToStdout = false;
        boolean printColumnToStdout;
        {
          // Print information to stdout for this many leading columns.
          final int MAX_HEAD_TO_PRINT_ON_STDOUT = 10;
          // Print information to stdout for this many trailing columns.
          final int MAX_TAIL_TO_PRINT_ON_STDOUT = 10;

          if (vecArr.length <= (MAX_HEAD_TO_PRINT_ON_STDOUT + MAX_TAIL_TO_PRINT_ON_STDOUT)) {
            // For small numbers of columns, print them all.
            printColumnToStdout = true;
          } else if (i < MAX_HEAD_TO_PRINT_ON_STDOUT) {
            printColumnToStdout = true;
          } else if (i == MAX_HEAD_TO_PRINT_ON_STDOUT) {
            printLogSeparatorToStdout = true;
            printColumnToStdout = false;
          } else if ((i + MAX_TAIL_TO_PRINT_ON_STDOUT) < vecArr.length) {
            printColumnToStdout = false;
          } else {
            printColumnToStdout = true;
          }
        }

        if (printLogSeparatorToStdout) {
          System.out.println("Additional column information only sent to log file...");
        }

        if (printColumnToStdout) {
          // Log to both stdout and log file.
          Log.info(String.format(" %-8s %15s %20s %20s %15s %11s %16s", CStr, typeStr, minStr, maxStr, naStr, isConstantStr, numLevelsStr));
        } else {
          // Log only to log file.
          Log.info_no_stdout(String.format(" %-8s %15s %20s %20s %15s %11s %16s", CStr, typeStr, minStr, maxStr, naStr, isConstantStr, numLevelsStr));
        }
      }
      Log.info(FrameUtils.chunkSummary(fr).toString());
    }
    catch (Exception ignore) {}   // Don't fail due to logging issues.  Just ignore them.
  }

  // --------------------------------------------------------------------------
  // Top-level parser driver
  private static void parse_impl(ParseDataset2 job, Key[] fkeys, CustomParser.ParserSetup setup, boolean delete_on_done) {
    assert setup._ncols > 0;
    if( fkeys.length == 0) { job.cancel(); return; }
    // Remove any previous instance and insert a sentinel (to ensure no one has
    // been writing to the same keys during our parse)!
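    // Parse pipeline, summarizing the steps below:
    //  1. Reserve Vec keys in the source VectorGroup and run MultiFileParseTask over all input files.
    //  2. Gather the per-node Enum maps (EnumFetchTask) and compute the global domain of each categorical column.
    //  3. Close the AppendableVecs into the destination Frame; SVFTask fills any chunks missing from sparse (SVMLight) output with zeros.
    //  4. EnumUpdateTask rewrites node-local enum ids to the global numbering, then rollup stats are computed and the Frame is unlocked.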
    Vec v = getVec(fkeys[0]);
    int reserveKeys = setup._pType == ParserType.SVMLight ? 25000000 : setup._ncols;
    VectorGroup vg = v.group();
    int vecIdStart = vg.reserveKeys(reserveKeys);
    MultiFileParseTask mfpt = job._mfpt = new MultiFileParseTask(v.group(),vecIdStart,setup,job._progress);
    mfpt.invoke(fkeys);

    EnumUpdateTask eut = null;
    // Calculate enum domain
    int n = 0;
    AppendableVec [] avs = mfpt.vecs();
    if((avs.length + vecIdStart) < reserveKeys) {
      Future f = vg.tryReturnKeys(vecIdStart + reserveKeys, vecIdStart + avs.length);
      if (f != null) try { f.get(); } catch (InterruptedException e) { } catch (ExecutionException e) {}
    }
    int [] ecols = new int[avs.length];
    for( int i = 0; i < ecols.length; ++i )
      if(avs[i].shouldBeEnum()) ecols[n++] = i;
    ecols = Arrays.copyOf(ecols, n);
    if( ecols.length > 0 ) {
      EnumFetchTask eft = new EnumFetchTask(H2O.SELF.index(), mfpt._eKey, ecols).invokeOnAllNodes();
      Enum[] enums = eft._gEnums;
      ValueString[][] ds = new ValueString[ecols.length][];
      int j = 0;
      for( int i : ecols )
        avs[i]._domain = ValueString.toString(ds[j++] = enums[i].computeColumnDomain());
      eut = new EnumUpdateTask(ds, eft._lEnums, mfpt._chunk2Enum, mfpt._eKey, ecols);
    }
    final Frame fr = new Frame(job.dest(),setup._columnNames != null ? setup._columnNames : genericColumnNames(avs.length),AppendableVec.closeAll(avs));
    // SVMLight is a sparse format; there may be missing chunks that are all 0s, so fill them in
    new SVFTask(fr).invokeOnAllNodes();
    fr.checkCompatible();
    // Update enums to the globally agreed numbering
    if( eut != null ) {
      Vec[] evecs = new Vec[ecols.length];
      for( int i = 0; i < evecs.length; ++i ) evecs[i] = fr.vecs()[ecols[i]];
      eut.doAll(evecs);
    }
    Futures fs = new Futures();
    for(Vec v2 : fr.vecs()) v2.rollupStats(fs);
    fs.blockForPending();
    logParseResults(job, fr);
    // Release the frame for overwriting
    fr.unlock(job.self());
    // Remove CSV files from H2O memory
    if( delete_on_done )
      for( Key k : fkeys ) Lockable.delete(k,job.self());
    else
      for( Key k : fkeys ) { Lockable l = UKV.get(k); l.unlock(job.self()); }
    job.remove();
  }

  public static class ParseProgressMonitor extends Iced implements Job.ProgressMonitor {
    final Key _progressKey;
    private long _progress;
    public ParseProgressMonitor(Key pKey){_progressKey = pKey;}
    @Override public void update(long n) {
      ParseDataset2.onProgress(n, _progressKey);
      _progress += n;
    }
    public long progress() { return _progress; }
  }

  // --------------------------------------------------------------------------
  // We want to do a standard MRTask with a collection of file-keys (so the
  // files are parsed in parallel across the cluster), but we want to throttle
  // the parallelism on each node.
  private static class MultiFileParseTask extends MRTask<MultiFileParseTask> {
    private final CustomParser.ParserSetup _setup; // The expected column layout
    private final VectorGroup _vg;    // Vector group of the target dataset
    private final int _vecIdStart;    // Start of available vector keys
    // Shared across all concurrent unrelated parses, a map to the node-local
    // Enum lists for each concurrent parse.
    private static NonBlockingHashMap<Key, Enum[]> _enums = new NonBlockingHashMap<Key, Enum[]>();
    // The Key used to sort out *this* parse's Enum[]
    private final Key _eKey = Key.make();
    private final Key _progress;
    // Mapping from Chunk# to the cluster-node-number holding the enum mapping.
    // It is either self for all non-parallel parses, or the Chunk-home for parallel parses.
    private int[] _chunk2Enum;
    // All column data for this one file
    // Vec _vecs[];
    // A mapping of Key+ByteVec to rolling total Chunk counts.
    private IcedHashMap<Key,IcedInt> _fileChunkOffsets;

    // OUTPUT fields:
    private FVecDataOut [] _dout;

    public AppendableVec[] vecs(){
      if(_dout.length == 1) return _dout[0]._vecs;
      int nCols = 0;
      for(FVecDataOut dout : _dout)
        nCols = Math.max(dout._vecs.length,nCols);
      AppendableVec [] res = new AppendableVec[nCols];
      int nchunks = 0;
      for(FVecDataOut dout : _dout)
        nchunks += dout.nChunks();
      long [] espc = MemoryManager.malloc8(nchunks);
      for(int i = 0; i < res.length; ++i) {
        res[i] = new AppendableVec(_vg.vecKey(_vecIdStart + i), espc, 0);
        res[i]._chunkTypes = MemoryManager.malloc1(nchunks);
      }
      for(int i = 0; i < _dout.length; ++i)
        for(int j = 0; j < _dout[i]._vecs.length; ++j)
          res[j].setSubRange(_dout[i]._vecs[j]);
      return res;
    }

    public String _parserr;     // NULL if parse is OK, else an error string

    MultiFileParseTask(VectorGroup vg, int vecIdStart, CustomParser.ParserSetup setup, Key progress ) {
      _vg = vg;
      _setup = setup;
      _progress = progress;
      _vecIdStart = vecIdStart;
      _runSingleThreaded = true;
    }

    @Override public void init(){
      super.init();
      _dout = new FVecDataOut[_keys.length];
    }

    @Override public MultiFileParseTask dfork(Key... keys){
      // init();
      _fileChunkOffsets = new IcedHashMap<Key, IcedInt>();
      int len = 0;
      for( Key k : keys) {
        _fileChunkOffsets.put(k,new IcedInt(len));
        len += getVec(k).nChunks();
      }
      // Mapping from Chunk# to cluster-node-number
      _chunk2Enum = MemoryManager.malloc4(len);
      Arrays.fill(_chunk2Enum, -1);
      return super.dfork(keys);
    }

    private FVecDataOut makeDout(ParserSetup localSetup, int chunkOff, int nchunks) {
      AppendableVec [] avs = new AppendableVec[localSetup._ncols];
      long [] espc = MemoryManager.malloc8(nchunks);
      for(int i = 0; i < avs.length; ++i)
        avs[i] = new AppendableVec(_vg.vecKey(i + _vecIdStart),espc, chunkOff);
      return localSetup._pType == ParserType.SVMLight
        ? new SVMLightFVecDataOut(_vg,chunkOff,avs,_vecIdStart,chunkOff,enums(_eKey,localSetup._ncols))
        : new FVecDataOut(_vg, chunkOff, chunkOff, enums(_eKey,localSetup._ncols), avs);
    }

    // Called once per file
    @Override public void map( Key key ) {
      // Get parser setup info for this chunk
      ByteVec vec = (ByteVec) getVec(key);
      byte [] bits = vec.chunkForChunkIdx(0)._mem;
      if(bits == null || bits.length == 0){
        assert false : "encountered empty file during multifile parse? should've been filtered already";
        return;                 // Should not really get here
      }
      final int chunkOff = _fileChunkOffsets.get(key)._val;
      Compression cpr = Utils.guessCompressionMethod(bits);
      CustomParser.ParserSetup localSetup = GuessSetup.guessSetup(Utils.unzipBytes(bits,cpr), _setup,false)._setup;
      // Local setup: nearly the same as the global all-files setup, but maybe
      // has the header-flag changed.
      if(!localSetup.isCompatible(_setup)) {
        _parserr = "Conflicting file layouts, expecting: " + _setup + " but found " + localSetup;
        return;
      }
      // Allow dup headers, if they are equals-ignoring-case
      boolean has_hdr = _setup._header && localSetup._header;
      if( has_hdr ) {           // Both have headers?
        for( int i = 0; has_hdr && i < localSetup._columnNames.length; ++i )
          has_hdr = localSetup._columnNames[i].equalsIgnoreCase(_setup._columnNames[i]);
        if( !has_hdr )          // Headers not compatible?
          // Then treat as no-headers, i.e., parse it as a normal row
          localSetup = new CustomParser.ParserSetup(ParserType.CSV,localSetup._separator, false);
      }

      // Parse the file
      try {
        switch( cpr ) {
        case NONE:
          if( localSetup._pType.parallelParseSupported ) {
            DParse dp = new DParse(_vg, localSetup, _vecIdStart, chunkOff, this, vec.nChunks());
            addToPendingCount(1);
            dp._removeKey = vec._key;
            dp.exec(new Frame(vec));
            for( int i = 0; i < vec.nChunks(); ++i )
              _chunk2Enum[chunkOff + i] = vec.chunkKey(i).home_node().index();
          } else {
            ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
            _dout[_lo] = streamParse(vec.openStream(pmon), localSetup, makeDout(localSetup,chunkOff,vec.nChunks()), pmon);
            for(int i = 0; i < vec.nChunks(); ++i)
              _chunk2Enum[chunkOff + i] = H2O.SELF.index();
          }
          break;
        case ZIP: {
          // Zipped file; no parallel decompression;
          ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
          ZipInputStream zis = new ZipInputStream(vec.openStream(pmon));
          ZipEntry ze = zis.getNextEntry();    // Get the *FIRST* entry
          // SVMLightFVecDataOut(VectorGroup vg, int cidx, AppendableVec [] avs, int vecIdStart, int chunkOff, Enum [] enums)
          // There is at least one entry in the zip file and it is not a directory.
          if( ze != null && !ze.isDirectory() )
            _dout[_lo] = streamParse(zis, localSetup, makeDout(localSetup,chunkOff,vec.nChunks()), pmon);
          else zis.close();                    // Confused: which zipped file to decompress
          // Set this node as the one which processed all the chunks
          for(int i = 0; i < vec.nChunks(); ++i)
            _chunk2Enum[chunkOff + i] = H2O.SELF.index();
          break;
        }
        case GZIP:
          // Zipped file; no parallel decompression;
          ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
          _dout[_lo] = streamParse(new GZIPInputStream(vec.openStream(pmon)),localSetup,makeDout(localSetup,chunkOff,vec.nChunks()), pmon);
          // Set this node as the one which processed all the chunks
          for(int i = 0; i < vec.nChunks(); ++i)
            _chunk2Enum[chunkOff + i] = H2O.SELF.index();
          break;
        }
      } catch( IOException ioe ) {
        throw new RuntimeException(ioe);
      }
    }

    // Reduce: combine errors from across files.
    // Roll-up other meta data
    @Override public void reduce( MultiFileParseTask mfpt ) {
      assert this != mfpt;
      // Combine parse errors from across files
      if( _parserr == null ) _parserr = mfpt._parserr;
      else if( mfpt._parserr != null ) _parserr += mfpt._parserr;
      // Collect & combine columns across files
      if( _dout == null ) _dout = mfpt._dout;
      else if(_dout != mfpt._dout) _dout = Utils.append(_dout,mfpt._dout);
      if( _chunk2Enum == null ) _chunk2Enum = mfpt._chunk2Enum;
      else if(_chunk2Enum != mfpt._chunk2Enum) { // We're sharing the global array!
        for( int i = 0; i < _chunk2Enum.length; ++i ) {
          if( _chunk2Enum[i] == -1 ) _chunk2Enum[i] = mfpt._chunk2Enum[i];
          else assert mfpt._chunk2Enum[i] == -1 : Arrays.toString(_chunk2Enum) + " :: " + Arrays.toString(mfpt._chunk2Enum);
        }
      }
    }

    private static Enum [] enums(Key eKey, int ncols){
      if(!_enums.containsKey(eKey)){
        Enum [] enums = new Enum[ncols];
        for(int i = 0; i < enums.length; ++i) enums[i] = new Enum();
        _enums.putIfAbsent(eKey, enums);
      }
      return _enums.get(eKey);
    }

    // ------------------------------------------------------------------------
    // Zipped file; no parallel decompression; decompress into local chunks,
    // parse local chunks; distribute chunks later.
    private FVecDataOut streamParse( final InputStream is, final CustomParser.ParserSetup localSetup, FVecDataOut dout, ParseProgressMonitor pmon) throws IOException {
      // All output into a fresh pile of NewChunks, one per column
      CustomParser p = localSetup.parser();
      // assume 2x inflation rate
      // if( localSetup._pType.parallelParseSupported )
      if( localSetup._pType.parallelParseSupported )
        try { p.streamParse(is, dout, pmon); } catch(IOException e) { throw new RuntimeException(e); }
      else
        try { p.streamParse(is, dout); } catch(Exception e) { throw new RuntimeException(e); }
      // Parse all internal "chunks", until we drain the zip-stream dry.  Not
      // real chunks, just flipping between 32K buffers.  Fills up the single
      // very large NewChunk.
      dout.close(_fs);
      return dout;
    }

    // ------------------------------------------------------------------------
    private static class DParse extends MRTask2<DParse> {
      private final CustomParser.ParserSetup _setup;
      private final int _vecIdStart;
      private final int _chunkOff; // For multifile parse, offset of the first chunk in the final dataset
      private final VectorGroup _vg;
      private FVecDataOut _dout;
      private final Key _eKey;
      final Key _progress;
      Key _removeKey;
      private transient final MultiFileParseTask _outerMFPT;
      final int _nchunks;
      private transient NonBlockingSetInt _visited;
      private transient long [] _espc;

      DParse(VectorGroup vg, CustomParser.ParserSetup setup, int vecIdstart, int startChunkIdx, MultiFileParseTask mfpt, int nchunks) {
        super(mfpt);
        _vg = vg;
        _setup = setup;
        _vecIdStart = vecIdstart;
        _chunkOff = startChunkIdx;
        _outerMFPT = mfpt;
        _eKey = mfpt._eKey;
        _progress = mfpt._progress;
        _nchunks = nchunks;
      }

      @Override public void setupLocal(){
        super.setupLocal();
        _espc = MemoryManager.malloc8(_nchunks);
        _visited = new NonBlockingSetInt();
      }

      @Override public void map( Chunk in ) {
        AppendableVec [] avs = new AppendableVec[_setup._ncols];
        for(int i = 0; i < avs.length; ++i)
          avs[i] = new AppendableVec(_vg.vecKey(_vecIdStart + i), _espc, _chunkOff);
        Enum [] enums = enums(_eKey,_setup._ncols);
        // Break out the input & output vectors before the parse loop
        // The Parser
        FVecDataIn din = new FVecDataIn(in);
        FVecDataOut dout;
        CustomParser p;
        switch(_setup._pType) {
        case CSV:
          p = new CsvParser(_setup);
          dout = new FVecDataOut(_vg,_chunkOff, _chunkOff + in.cidx(),enums,avs);
          break;
        case SVMLight:
          p = new SVMLightParser(_setup);
          dout = new SVMLightFVecDataOut(_vg, _chunkOff + in.cidx(), avs, _vecIdStart, _chunkOff, enums);
          break;
        default:
          throw H2O.unimpl();
        }
        p.parallelParse(in.cidx(),din,dout);
        (_dout = dout).close(_fs);
        onProgress(in._len, _progress); // Record bytes parsed
        // Remove parsed data right away (each chunk is used by 2)
        final int cidx = in.cidx();
        if(!_visited.add(cidx)) {
          Value v = H2O.get(in._vec.chunkKey(cidx));
          if(v != null && v.isPersisted()) { v.freePOJO(); v.freeMem(); }
        }
        if(!_visited.add(cidx+1)) {
          Value v = H2O.get(in._vec.chunkKey(cidx+1));
          if(v != null && v.isPersisted()) { v.freePOJO(); v.freeMem(); }
        }
      }
      @Override public void reduce(DParse dp) {
        if(_dout == null) _dout = dp._dout;
        else _dout.reduce(dp._dout);
      }
      @Override public void postGlobal() {
        super.postGlobal();
        _outerMFPT._dout[_outerMFPT._lo] = _dout;
        _dout = null;           // Reclaim GC eagerly
        if(_removeKey != null) UKV.remove(_removeKey);
      }
    }

    // Find & remove all partially built output chunks & vecs
    private Futures onExceptionCleanup(Futures fs) {
      int nchunks = _chunk2Enum.length;
      int ncols = _setup._ncols;
      for( int i = 0; i < ncols; ++i ) {
        Key vkey = _vg.vecKey(_vecIdStart + i);
        DKV.remove(vkey,fs);
        for( int c = 0; c < nchunks; ++c )
          DKV.remove(Vec.chunkKey(vkey,c),fs);
      }
      cancel(true);
      return fs;
    }
  }

  // ------------------------------------------------------------------------
  /** Parsed data output specialized for fluid vecs.
   *  @author tomasnykodym
   */
  static class FVecDataOut extends Iced implements CustomParser.StreamDataOut {
    protected transient NewChunk [] _nvs;
    public final int _chunkOff;
    protected AppendableVec [] _vecs;
    private transient final Enum [] _enums;
    protected byte [] _ctypes;
    long _nLines;
    int _col = -1;
    final int _cidx;
    int _nChunks;
    boolean _closedVecs = false;
    private final VectorGroup _vg;

    static final protected byte UCOL = 0; // unknown col type
    static final protected byte NCOL = 1; // numeric col type
    static final protected byte ECOL = 2; // enum    col type
    static final protected byte TCOL = 3; // time    col type
    static final protected byte ICOL = 4; // UUID    col type

    public int nChunks(){return _nChunks;}

//    private static AppendableVec[] newAppendables(int n, VectorGroup vg, int vecIdStart){
//      AppendableVec [] apps = new AppendableVec[n];
//
//      for(int i = 0; i < n; ++i)
//        apps[i] = new AppendableVec(vg.vecKey(vecIdStart + i));
//      return apps;
//    }
//    public FVecDataOut(VectorGroup vg, int cidx, int ncols, int vecIdStart, Enum[] enums){
//      this(vg,cidx,ncols,vecIdStart,enums,newAppendables(ncols,vg,vecIdStart));
//    }

    public FVecDataOut(VectorGroup vg, int chunkOff, int cidx, Enum [] enums, AppendableVec [] appendables){
      assert cidx - chunkOff >= 0 : "incompatible cidx/chunkOff " + chunkOff + ", " + cidx;
      _vecs = appendables;
      _chunkOff = chunkOff;
      _enums = enums;
      _cidx = cidx;
      _vg = vg;
      _ctypes = MemoryManager.malloc1(appendables.length);
      _nvs = new NewChunk[appendables.length];
      for(int i = 0; i < appendables.length; ++i)
        _nvs[i] = (NewChunk)_vecs[i].chunkForChunkIdx(_cidx);
    }

    @Override public FVecDataOut reduce(StreamDataOut sdout){
      FVecDataOut dout = (FVecDataOut)sdout;
      _nChunks += dout._nChunks;
      if( dout != null && _vecs != dout._vecs){
        if(dout._vecs.length > _vecs.length){
          AppendableVec [] v = _vecs;
          _vecs = dout._vecs;
          for(int i = 1; i < _vecs.length; ++i)
            _vecs[i]._espc = _vecs[0]._espc;
          dout._vecs = v;
        }
        for(int i = 0; i < dout._vecs.length; ++i)
          _vecs[i].reduce(dout._vecs[i]);
      }
      return this;
    }

    @Override public FVecDataOut close(){
      Futures fs = new Futures();
      close(fs);
      fs.blockForPending();
      return this;
    }

    public void check(){
      if(_nvs != null)
        for(NewChunk nv : _nvs)
          assert (nv._len == _nLines) : "unexpected number of lines in NewChunk, got " + nv._len + ", but expected " + _nLines;
    }

    @Override public FVecDataOut close(final Futures fs){
      if( _nvs == null ) return this; // Might call close twice
      for(NewChunk nc : _nvs)
        assert nc._len == _nLines : "incompatible lengths after parsing chunk, " + _nLines + " != " + nc._len;
      RecursiveAction [] rs = new RecursiveAction[_nvs.length];
      for(int i = 0; i < _nvs.length; ++i) {
        final int fi = i;
        rs[i] = new RecursiveAction() {
          @Override protected void compute() {
            _nvs[fi].close(_cidx, fs);
            _nvs[fi] = null;    // Free for GC
          }
        };
      }
      ForkJoinTask.invokeAll(rs);
      _nChunks++;
      _nvs = null;              // Free for GC
      return this;
    }

    @Override public FVecDataOut nextChunk(){
      return new FVecDataOut(_vg, _chunkOff,_cidx+1,_enums, _vecs);
    }

//    protected Vec [] closeVecs(){
//      Futures fs = new Futures();
//      _closedVecs = true;
//      Vec [] res = new Vec[_vecs.length];
//      for(int i = 0; i < _vecs.length; ++i)
//        res[i] = _vecs[i].close(fs);
//      _vecs = null; // Free for GC
//      fs.blockForPending();
//      return res;
//    }

    @Override public void newLine() {
      if(_col >= 0){
        ++_nLines;
        for(int i = _col+1; i < _vecs.length; ++i)
          addInvalidCol(i);
      }
      _col = -1;
    }

    @Override public void addNumCol(int colIdx, long number, int exp) {
      if( colIdx < _vecs.length ) {
        _nvs[_col = colIdx].addNum(number, exp);
        if(_ctypes[colIdx] == UCOL ) _ctypes[colIdx] = NCOL;
      }
    }

    @Override public void addInvalidCol(int colIdx) {
      if(colIdx < _vecs.length) _nvs[_col = colIdx].addNA();
    }
    @Override public final boolean isString(int colIdx) { return false; }

    @Override public final void addStrCol(int colIdx, ValueString str) {
      if(colIdx < _nvs.length){
        if(_ctypes[colIdx] == NCOL){ // support enforced types
          addInvalidCol(colIdx);
          return;
        }
        if(_ctypes[colIdx] == UCOL && ParseTime.attemptTimeParse(str) > 0)
          _ctypes[colIdx] = TCOL;
        if( _ctypes[colIdx] == UCOL ) { // Attempt UUID parse
          int old = str.get_off();
          ParseTime.attemptUUIDParse0(str);
          ParseTime.attemptUUIDParse1(str);
          if( str.get_off() != -1 ) _ctypes[colIdx] = ICOL;
          str.setOff(old);
        }

        if( _ctypes[colIdx] == TCOL ) {
          long l = ParseTime.attemptTimeParse(str);
          if( l == Long.MIN_VALUE ) addInvalidCol(colIdx);
          else {
            int time_pat = ParseTime.decodePat(l);  // Get time pattern
            l = ParseTime.decodeTime(l);            // Get time
            addNumCol(colIdx, l, 0);                // Record time in msec
            _nvs[_col]._timCnt[time_pat]++;         // Count histo of time parse patterns
          }
        } else if( _ctypes[colIdx] == ICOL ) { // UUID column?  Only allow UUID parses
          long lo = ParseTime.attemptUUIDParse0(str);
          long hi = ParseTime.attemptUUIDParse1(str);
          if( str.get_off() == -1 ) { lo = C16Chunk._LO_NA; hi = C16Chunk._HI_NA; }
          if( colIdx < _vecs.length ) _nvs[_col = colIdx].addUUID(lo, hi);
        } else if(!_enums[_col = colIdx].isKilled()) {
          // Store the enum id into the exponent, so that it will be interpreted as NA if compressing as a numcol.
          int id = _enums[colIdx].addKey(str);
          if(_ctypes[colIdx] == UCOL && id > 1) _ctypes[colIdx] = ECOL;
          _nvs[colIdx].addEnum(id);
        } else // Turn the column into NAs by adding a value overflowing Enum.MAX_SIZE
          _nvs[colIdx].addEnum(Integer.MAX_VALUE);
      }
      // else System.err.println("additional column (" + colIdx + ":" + str + ") on line " + linenum());
    }

    /** Adds a double value to the column. */
    @Override public void addNumCol(int colIdx, double value) {
      if (Double.isNaN(value)) {
        addInvalidCol(colIdx);
      } else {
        double d = value;
        int exp = 0;
        long number = (long)d;
        while (number != d) {
          d = d * 10;
          --exp;
          number = (long)d;
        }
        addNumCol(colIdx, number, exp);
      }
    }

    @Override public void setColumnNames(String [] names){}
    @Override public final void rollbackLine() {}
    @Override public void invalidLine(String err) { newLine(); }
    @Override public void invalidValue(int line, int col) {}
  }

  // ------------------------------------------------------------------------
  /** Parser data-in, taking data from a fluid-vec chunk.
   *  @author tomasnykodym
   */
  private static class FVecDataIn implements CustomParser.DataIn {
    final Vec _vec;
    Chunk _chk;
    int _idx;
    final long _firstLine;
    public FVecDataIn(Chunk chk){
      _chk = chk;
      _idx = _chk.cidx();
      _firstLine = _chk._start;
      _vec = chk._vec;
    }
    @Override public byte[] getChunkData(int cidx) {
      if(cidx != _idx)
        _chk = cidx < _vec.nChunks() ? _vec.chunkForChunkIdx(_idx = cidx) : null;
      return (_chk == null) ? null : _chk._mem;
    }
    @Override public int getChunkDataStart(int cidx) { return -1; }
    @Override public void setChunkDataStart(int cidx, int offset) { }
  }
}
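// ---------------------------------------------------------------------------
// Illustrative usage sketch (an assumption, not part of the original file):
// only the ParseDataset2 entry points above are real; the key names below are
// hypothetical and the raw-data keys are assumed to already be loaded into the DKV.
//
//   Key dest   = Key.make("parsed.hex");             // hypothetical destination key
//   Key[] srcs = new Key[]{ Key.make("file1.csv") }; // hypothetical raw-data keys
//   // Blocking parse with a guessed setup, deleting the raw inputs when done:
//   Frame fr = ParseDataset2.parse(dest, srcs);
//   // Or run it as a background Job with an explicit setup and block explicitly:
//   // ParseDataset2 job = ParseDataset2.forkParseDataset(dest, srcs,
//   //     new GuessSetup.GuessSetupTsk(new CustomParser.ParserSetup(), true).invoke(srcs)._gSetup._setup, true);
//   // Frame fr2 = job.get();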