package water.fvec;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.zip.*;
import jsr166y.CountedCompleter;
import jsr166y.ForkJoinTask;
import jsr166y.ForkJoinWorkerThread;
import jsr166y.RecursiveAction;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.Vec.VectorGroup;
import water.nbhm.NonBlockingHashMap;
import water.nbhm.NonBlockingSetInt;
import water.parser.*;
import water.parser.CustomParser.DataOut;
import water.parser.CustomParser.ParserSetup;
import water.parser.CustomParser.ParserType;
import water.parser.CustomParser.StreamDataOut;
import water.parser.Enum;
import water.util.FrameUtils;
import water.util.Log;
import water.util.Utils.IcedHashMap;
import water.util.Utils.IcedInt;
import water.util.Utils;
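/**
 * Distributed parse of one or more file-backed byte Vec keys into a Frame of fluid Vecs.
 *
 * A minimal usage sketch (key names below are illustrative only):
 * <pre>
 *   Key dest   = Key.make("parsed.hex");                    // destination Frame key
 *   Key[] srcs = new Key[]{ Key.make("some_file.csv") };    // previously imported file key(s)
 *   Frame fr   = ParseDataset2.parse(dest, srcs);           // guesses the setup and blocks until done
 *   // or, non-blocking with an explicit setup and progress polling:
 *   //   ParseDataset2 job = ParseDataset2.forkParseDataset(dest, srcs, setup, true);
 *   //   float p = job.progress();
 * </pre>
 */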
public final class ParseDataset2 extends Job {
public final Key _progress; // Job progress Key
private MultiFileParseTask _mfpt; // Access to partially built vectors for cleanup after parser crash
public static enum Compression { NONE, ZIP, GZIP }
public static Key [] filterEmptyFiles(Key [] keys){
Arrays.sort(keys);
// first check if there are any empty files and if so remove them
Vec [] vecs = new Vec [keys.length];
int c = 0;
for(int i = 0; i < vecs.length; ++i) {
vecs[i] = getVec(keys[i]);
if(vecs[i].length() == 0) c++;
}
if(c > 0){ // filter out empty files
Key[] ks = new Key[keys.length-c];
Vec[] vs = new Vec[vecs.length-c];
int j = 0;
for(int i = 0; i < keys.length; ++i)
if(vecs[i].length() != 0){
ks[j] = keys[i];
vs[j] = vecs[i];
++j;
}
keys = ks;
}
return keys;
}
// --------------------------------------------------------------------------
// Parse an array of csv input/file keys into an array of distributed output Vecs
public static Frame parse(Key okey, Key [] keys) {
return parse(okey,keys,new GuessSetup.GuessSetupTsk(new ParserSetup(),true).invoke(keys)._gSetup._setup,true);
}
public static Frame parse(Key okey, Key[] keys, CustomParser.ParserSetup globalSetup, boolean delete_on_done) {
if( globalSetup._ncols == 0 ) throw new java.lang.IllegalArgumentException(globalSetup.toString());
return forkParseDataset(okey, keys, globalSetup, delete_on_done).get();
}
// Same parse, as a backgroundable Job
public static ParseDataset2 forkParseDataset(final Key dest, Key[] keys, final CustomParser.ParserSetup setup, boolean delete_on_done) {
keys = filterEmptyFiles(keys);
setup.checkDupColumnNames();
// Some quick sanity checks: no overwriting your input key, and a resource check.
long sum=0;
for( Key k : keys ) {
if( dest.equals(k) )
throw new IllegalArgumentException("Destination key "+dest+" must be different from all sources");
sum += DKV.get(k).length(); // Sum of all input filesizes
}
long memsz=0; // Cluster memory
for( H2ONode h2o : H2O.CLOUD._memary )
memsz += h2o.get_max_mem();
if( sum > memsz*4 )
throw new IllegalArgumentException("Total input file size of "+PrettyPrint.bytes(sum)+" is much larger than total cluster memory of "+PrettyPrint.bytes(memsz)+", please use either a larger cluster or smaller data.");
ParseDataset2 job = new ParseDataset2(dest, keys);
new Frame(job.dest(),new String[0],new Vec[0]).delete_and_lock(job.self()); // Lock BEFORE returning
for( Key k : keys ) Lockable.read_lock(k,job.self()); // Lock BEFORE returning
ParserFJTask fjt = new ParserFJTask(job, keys, setup, delete_on_done); // Fire off background parse
// Make a wrapper task that only *starts* when the ParserFJTask fjt
// completes - in particular, it starts even when fjt completes
// exceptionally... thus the fjt onExceptionalCompletion code runs
// completely before this empty task starts - providing a simple barrier.
// Threads blocking on the job will block on the "cleanup" task, which will
// block until the fjt runs its onCompletion or onExceptionalCompletion code.
H2OCountedCompleter cleanup = new H2OCountedCompleter() {
@Override public void compute2() { }
@Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) { return true; }
};
fjt.setCompleter(cleanup);
job.start(cleanup);
H2O.submitTask(fjt);
return job;
}
// Set up a private background parse job
private ParseDataset2(Key dest, Key[] fkeys) {
destination_key = dest;
// Job progress Key
_progress = Key.make((byte) 0, Key.JOB);
UKV.put(_progress, ParseProgress.make(fkeys));
}
// Simple internal class doing background parsing, with trackable Job status
public static class ParserFJTask extends H2OCountedCompleter {
final ParseDataset2 _job;
Key[] _keys;
CustomParser.ParserSetup _setup;
boolean _delete_on_done;
public ParserFJTask( ParseDataset2 job, Key[] keys, CustomParser.ParserSetup setup, boolean delete_on_done) {
_job = job;
_keys = keys;
_setup = setup;
_delete_on_done = delete_on_done;
}
@Override public void compute2() {
parse_impl(_job, _keys, _setup, _delete_on_done);
tryComplete();
}
// Took a crash/NPE somewhere in the parser. Attempt cleanup.
@Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller){
Futures fs = new Futures();
if( _job != null ) {
UKV.remove(_job.destination_key,fs);
UKV.remove(_job._progress,fs);
// Find & remove all partially-built output vecs & chunks
if( _job._mfpt != null ) _job._mfpt.onExceptionCleanup(fs);
}
// Assume the input is corrupt - or already partially deleted after
// parsing. Nuke it all - no partial Vecs lying around.
for( Key k : _keys ) UKV.remove(k,fs);
fs.blockForPending();
// As soon as the job is canceled, threads blocking on the job will
// wake up. Better have all cleanup done first!
if( _job != null ) _job.cancel(ex);
return true;
}
}
// --------------------------------------------------------------------------
// Parser progress
static class ParseProgress extends Iced {
final long _total;
long _value;
DException _ex;
ParseProgress(long val, long total){_value = val; _total = total;}
// Total number of steps is equal to total bytecount across files
static ParseProgress make( Key[] fkeys ) {
long total = 0;
for( Key fkey : fkeys )
total += getVec(fkey).length();
return new ParseProgress(0,total);
}
public void setException(DException ex){_ex = ex;}
public DException getException(){return _ex;}
}
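// A worked example of the bookkeeping below (numbers are illustrative): with
// four 1 GB input files, ParseProgress._total is 4 GB; once the stream monitors
// and DParse maps have reported 1 GB of consumed input via onProgress(),
// progress() returns 0.25.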
static void onProgress(final long len, final Key progress) {
new TAtomic<ParseProgress>() {
@Override public ParseProgress atomic(ParseProgress old) {
if (old == null) return null;
old._value += len;
return old;
}
}.fork(progress);
}
@Override public float progress() {
ParseProgress progress = UKV.get(_progress);
if( progress == null || progress._total == 0 ) return 0;
return progress._value / (float) progress._total;
}
@Override public void remove() {
DKV.remove(_progress);
super.remove();
}
/** Task to update enum values to match the global numbering scheme.
* Performs update in place so that values originally numbered using
* node-local unordered numbering will be numbered using global numbering.
* @author tomasnykodym
*/
private static class EnumUpdateTask extends MRTask2<EnumUpdateTask> {
private transient int[][][] _emap;
final Key _eKey;
private final ValueString [][] _gDomain;
private final Enum [][] _lEnums;
private final int [] _chunk2Enum;
private final int [] _colIds;
private EnumUpdateTask(ValueString [][] gDomain, Enum [][] lEnums, int [] chunk2Enum, Key lDomKey, int [] colIds){
_gDomain = gDomain; _lEnums = lEnums; _chunk2Enum = chunk2Enum; _eKey = lDomKey; _colIds = colIds;
}
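// A worked renumbering example (values are illustrative): if a node-local Enum
// assigned ids in arrival order, "red"->0 and "blue"->1, but the globally agreed
// domain for that column is ["blue","red"], then emap for that node and column
// is {0 -> 1, 1 -> 0}; map() below rewrites every stored local id through this
// table so the chunks match the global domain.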
private int[][] emap(int nodeId) {
if( _emap == null ) _emap = new int[_lEnums.length][][];
if( _emap[nodeId] == null ) {
int[][] emap = new int[_gDomain.length][];
for( int i = 0; i < _gDomain.length; ++i ) {
if( _gDomain[i] != null ) {
assert _lEnums[nodeId] != null : "missing lEnum of node " + nodeId + ", enums = " + Arrays.toString(_lEnums);
final Enum e = _lEnums[nodeId][_colIds[i]];
emap[i] = new int[e.maxId()+1];
Arrays.fill(emap[i], -1);
for(int j = 0; j < _gDomain[i].length; ++j) {
ValueString vs = _gDomain[i][j];
if( e.containsKey(vs) ) {
assert e.getTokenId(vs) <= e.maxId():"maxIdx = " + e.maxId() + ", got " + e.getTokenId(vs);
emap[i][e.getTokenId(vs)] = j;
}
}
}
}
_emap[nodeId] = emap;
}
return _emap[nodeId];
}
@Override public void map(Chunk [] chks){
int[][] emap = emap(_chunk2Enum[chks[0].cidx()]);
final int cidx = chks[0].cidx();
for(int i = 0; i < chks.length; ++i) {
Chunk chk = chks[i];
if(_gDomain[i] == null) // killed, replace with all NAs
DKV.put(chk._vec.chunkKey(chk.cidx()),new C0DChunk(Double.NaN,chk._len));
else for( int j = 0; j < chk._len; ++j){
if( chk.isNA0(j) )continue;
long l = chk.at80(j);
if (l < 0 || l >= emap[i].length)
reportBrokenEnum(chk, i, j, l, emap);
if(emap[i][(int)l] < 0)
throw new RuntimeException(H2O.SELF.toString() + ": missing enum at col:" + i + ", line: " + j + ", val = " + l + ", chunk=" + chk.getClass().getSimpleName());
chk.set0(j, emap[i][(int)l]);
}
chk.close(cidx, _fs);
}
}
private void reportBrokenEnum( Chunk chk, int i, int j, long l, int[][] emap ) {
Chunk chk2 = chk._chk2;
chk._chk2 = null;
StringBuilder sb = new StringBuilder("Enum renumber task, column # " + i + ": Found OOB index " + l + " (expected 0 - " + emap[i].length + ", global domain has " + _gDomain[i].length + " levels) pulled from " + chk.getClass().getSimpleName() + "\n");
int k = 0;
for(; k < Math.min(5,chk._len); ++k)
sb.append("at8[" + (k+chk._start) + "] = " + chk.at80(k) + ", chk2 = " + (chk2 != null?chk2.at80(k):"") + "\n");
k = Math.max(k,j-2);
sb.append("...\n");
for(; k < Math.min(chk._len,j+2); ++k)
sb.append("at8[" + (k+chk._start) + "] = " + chk.at80(k) + ", chk2 = " + (chk2 != null?chk2.at80(k):"") + "\n");
sb.append("...\n");
k = Math.max(k,chk._len-5);
for(; k < chk._len; ++k)
sb.append("at8[" + (k+chk._start) + "] = " + chk.at80(k) + ", chk2 = " + (chk2 != null?chk2.at80(k):"") + "\n");
throw new RuntimeException(sb.toString());
}
}
// --------------------------------------------------------------------------
private static class EnumFetchTask extends MRTask<EnumFetchTask> {
private final Key _k;
private final int[] _ecols;
private final int _homeNode; // node where the computation started; enums from this node MUST be cloned!
private Enum[] _gEnums; // global enums per column
private Enum[][] _lEnums; // local enums per node per column
private EnumFetchTask(int homeNode, Key k, int[] ecols){_homeNode = homeNode; _k = k;_ecols = ecols;}
@Override public void map(Key key) {
_lEnums = new Enum[H2O.CLOUD.size()][];
if(MultiFileParseTask._enums.containsKey(_k)){
_lEnums[H2O.SELF.index()] = _gEnums = MultiFileParseTask._enums.get(_k);
// if we are the original node (i.e. there will be no sending over the
// wire), we have to clone the enums so we do not share the same objects
// (sharing causes problems when computing the column domain and the renumbering maps).
if( H2O.SELF.index() == _homeNode ) {
_gEnums = _gEnums.clone();
for(int i = 0; i < _gEnums.length; ++i)
_gEnums[i] = _gEnums[i].clone();
}
MultiFileParseTask._enums.remove(_k);
}
}
@Override public void reduce(EnumFetchTask etk) {
if(_gEnums == null) {
_gEnums = etk._gEnums;
_lEnums = etk._lEnums;
} else if (etk._gEnums != null) {
for( int i : _ecols ) _gEnums[i].merge(etk._gEnums[i]);
for( int i = 0; i < _lEnums.length; ++i )
if( _lEnums[i] == null ) _lEnums[i] = etk._lEnums[i];
else assert etk._lEnums[i] == null;
}
}
}
// --------------------------------------------------------------------------
// Run once on all nodes; fill in missing zero chunks
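// For example (illustrative): an SVMLight parse only creates chunks for the
// columns a given row block actually touches, so column 7 may have no chunk at
// index 3; this task writes a zero-filled C0DChunk of the right row count into
// that slot so the resulting Frame has a complete, compatible chunk layout.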
private static class SVFTask extends MRTask<SVFTask> {
private final Frame _f;
private SVFTask( Frame f ) { _f = f; }
@Override public void map(Key key) {
Vec v0 = _f.anyVec();
ArrayList<RecursiveAction> rs = new ArrayList<RecursiveAction>();
for( int i = 0; i < v0.nChunks(); ++i ) {
if( !v0.chunkKey(i).home() ) continue;
final int fi = i;
rs.add(new RecursiveAction() {
@Override
protected void compute() {
// First find the row count (nlines) from any non-missing chunk at this
// chunk index; done on locally-homed chunks only - to keep the data distribution.
int nlines = 0;
for( Vec vec : _f.vecs() ) {
Value val = H2O.get(vec.chunkKey(fi)); // Local-get only
if( val != null ) {
nlines = ((Chunk)val.get())._len;
break;
}
}
final int fnlines = nlines;
// Now fill in appropriate-sized zero chunks
for(int j = 0; j < _f.numCols(); ++j) {
Vec vec = _f.vec(j);
Key k = vec.chunkKey(fi);
Value val = H2O.get(k); // Local-get only
if( val == null ) // Missing? Fill in w/zero chunk
H2O.putIfMatch(k, new Value(k, new C0DChunk(0, fnlines)), null);
}
}
});
}
ForkJoinTask.invokeAll(rs);
}
@Override public void reduce( SVFTask drt ) {}
}
private static Vec getVec(Key key) {
Object o = UKV.get(key);
return o instanceof Vec ? (Vec) o : ((Frame) o).vecs()[0];
}
private static String [] genericColumnNames(int ncols){
String [] res = new String[ncols];
for(int i = 0; i < res.length; ++i) res[i] = "C" + String.valueOf(i+1);
return res;
}
// Log information about the dataset we just parsed.
private static void logParseResults(ParseDataset2 job, Frame fr) {
try {
long numRows = fr.anyVec().length();
Log.info("Parse result for " + job.dest() + " (" + Long.toString(numRows) + " rows):");
Vec[] vecArr = fr.vecs();
for( int i = 0; i < vecArr.length; i++ ) {
Vec v = vecArr[i];
boolean isCategorical = v.isEnum();
boolean isConstant = (v.min() == v.max());
String CStr = String.format("C%d:", i+1);
String typeStr = String.format("%s", (v._isUUID ? "UUID" : (isCategorical ? "categorical" : "numeric")));
String minStr = String.format("min(%f)", v.min());
String maxStr = String.format("max(%f)", v.max());
long numNAs = v.naCnt();
String naStr = (numNAs > 0) ? String.format("na(%d)", numNAs) : "";
String isConstantStr = isConstant ? "constant" : "";
String numLevelsStr = isCategorical ? String.format("numLevels(%d)", v.domain().length) : "";
boolean printLogSeparatorToStdout = false;
boolean printColumnToStdout;
{
// Print information to stdout for this many leading columns.
final int MAX_HEAD_TO_PRINT_ON_STDOUT = 10;
// Print information to stdout for this many trailing columns.
final int MAX_TAIL_TO_PRINT_ON_STDOUT = 10;
if (vecArr.length <= (MAX_HEAD_TO_PRINT_ON_STDOUT + MAX_TAIL_TO_PRINT_ON_STDOUT)) {
// For small numbers of columns, print them all.
printColumnToStdout = true;
} else if (i < MAX_HEAD_TO_PRINT_ON_STDOUT) {
printColumnToStdout = true;
} else if (i == MAX_HEAD_TO_PRINT_ON_STDOUT) {
printLogSeparatorToStdout = true;
printColumnToStdout = false;
} else if ((i + MAX_TAIL_TO_PRINT_ON_STDOUT) < vecArr.length) {
printColumnToStdout = false;
} else {
printColumnToStdout = true;
}
}
if (printLogSeparatorToStdout) {
System.out.println("Additional column information only sent to log file...");
}
if (printColumnToStdout) {
// Log to both stdout and log file.
Log.info(String.format(" %-8s %15s %20s %20s %15s %11s %16s", CStr, typeStr, minStr, maxStr, naStr, isConstantStr, numLevelsStr));
}
else {
// Log only to log file.
Log.info_no_stdout(String.format(" %-8s %15s %20s %20s %15s %11s %16s", CStr, typeStr, minStr, maxStr, naStr, isConstantStr, numLevelsStr));
}
}
Log.info(FrameUtils.chunkSummary(fr).toString());
}
catch (Exception ignore) {} // Don't fail due to logging issues. Just ignore them.
}
// --------------------------------------------------------------------------
// Top-level parser driver
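// The driver proceeds in stages: reserve Vec keys in the input's VectorGroup,
// run MultiFileParseTask over all file keys, return any unused reserved keys,
// gather per-node Enums and compute the global column domains, close the
// AppendableVecs into a Frame, fill in missing zero chunks (SVMLight), rewrite
// enum ids to the global numbering, compute rollup stats, then unlock/cleanup.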
private static void parse_impl(ParseDataset2 job, Key[] fkeys, CustomParser.ParserSetup setup, boolean delete_on_done) {
assert setup._ncols > 0;
if( fkeys.length == 0) { job.cancel(); return; }
// Remove any previous instance and insert a sentinel (to ensure no one has
// been writing to the same keys during our parse)!
Vec v = getVec(fkeys[0]);
int reserveKeys = setup._pType == ParserType.SVMLight?25000000:setup._ncols;
VectorGroup vg = v.group();
int vecIdStart = vg.reserveKeys(reserveKeys);
MultiFileParseTask mfpt = job._mfpt = new MultiFileParseTask(v.group(),vecIdStart,setup,job._progress);
mfpt.invoke(fkeys);
EnumUpdateTask eut = null;
// Calculate enum domain
int n = 0;
AppendableVec [] avs = mfpt.vecs();
if(avs.length < reserveKeys) {
Future f = vg.tryReturnKeys(vecIdStart + reserveKeys, vecIdStart + avs.length);
if (f != null) try { f.get(); } catch (InterruptedException e) { } catch (ExecutionException e) {}
}
int [] ecols = new int[avs.length];
for( int i = 0; i < ecols.length; ++i )
if(avs[i].shouldBeEnum())
ecols[n++] = i;
ecols = Arrays.copyOf(ecols, n);
if( ecols.length > 0 ) {
EnumFetchTask eft = new EnumFetchTask(H2O.SELF.index(), mfpt._eKey, ecols).invokeOnAllNodes();
Enum[] enums = eft._gEnums;
ValueString[][] ds = new ValueString[ecols.length][];
int j = 0;
for( int i : ecols ) avs[i]._domain = ValueString.toString(ds[j++] = enums[i].computeColumnDomain());
eut = new EnumUpdateTask(ds, eft._lEnums, mfpt._chunk2Enum, mfpt._eKey, ecols);
}
final Frame fr = new Frame(job.dest(),setup._columnNames != null?setup._columnNames:genericColumnNames(avs.length),AppendableVec.closeAll(avs));
// SVMLight is a sparse format; there may be missing chunks that are all 0s, so fill them in
new SVFTask(fr).invokeOnAllNodes();
fr.checkCompatible();
// Update enums to the globally agreed numbering
if( eut != null ) {
Vec[] evecs = new Vec[ecols.length];
for( int i = 0; i < evecs.length; ++i ) evecs[i] = fr.vecs()[ecols[i]];
eut.doAll(evecs);
}
Futures fs = new Futures();
for(Vec v2:fr.vecs())
v2.rollupStats(fs);
fs.blockForPending();
logParseResults(job, fr);
// Release the frame for overwriting
fr.unlock(job.self());
// Remove CSV files from H2O memory
if( delete_on_done ) for( Key k : fkeys ) Lockable.delete(k,job.self());
else for( Key k : fkeys ) {
Lockable l = UKV.get(k);
l.unlock(job.self());
}
job.remove();
}
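// Progress monitor handed to ByteVec.openStream(...) in the single-threaded
// parse paths of MultiFileParseTask below, so every block read from the
// (possibly compressed) input stream is reported to the shared progress key
// via onProgress().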
public static class ParseProgressMonitor extends Iced implements Job.ProgressMonitor {
final Key _progressKey;
private long _progress;
public ParseProgressMonitor(Key pKey){_progressKey = pKey;}
@Override public void update(long n) {
ParseDataset2.onProgress(n, _progressKey);
_progress += n;
}
public long progress() {
return _progress;
}
}
// --------------------------------------------------------------------------
// We want to do a standard MRTask with a collection of file-keys (so the
// files are parsed in parallel across the cluster), but we want to throttle
// the parallelism on each node.
private static class MultiFileParseTask extends MRTask<MultiFileParseTask> {
private final CustomParser.ParserSetup _setup; // The expected column layout
private final VectorGroup _vg; // vector group of the target dataset
private final int _vecIdStart; // Start of available vector keys
// Shared across all concurrent unrelated parses: a map from parse key to the
// node-local Enum lists for that parse.
private static NonBlockingHashMap<Key, Enum[]> _enums = new NonBlockingHashMap<Key, Enum[]>();
// The Key used to sort out *this* parse's Enum[]
private final Key _eKey = Key.make();
private final Key _progress;
// Mapping from Chunk# to cluster-node-number holding the enum mapping.
// It is either self for all the non-parallel parses, or the Chunk-home for parallel parses.
private int[] _chunk2Enum;
// All column data for this one file
// Vec _vecs[];
// A mapping of Key+ByteVec to rolling total Chunk counts.
private IcedHashMap<Key,IcedInt> _fileChunkOffsets;
// OUTPUT fields:
private FVecDataOut [] _dout;
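// vecs() below folds the per-file outputs into one AppendableVec per column:
// all files in this parse share a single espc (rows-per-chunk) array, and each
// file's AppendableVecs are merged in as a sub-range of the global chunk index
// space via setSubRange().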
public AppendableVec[] vecs(){
if(_dout.length == 1)
return _dout[0]._vecs;
int nCols = 0;
for(FVecDataOut dout:_dout)
nCols = Math.max(dout._vecs.length,nCols);
AppendableVec [] res = new AppendableVec[nCols];
int nchunks = 0;
for(FVecDataOut dout:_dout)
nchunks += dout.nChunks();
long [] espc = MemoryManager.malloc8(nchunks);
for(int i = 0; i < res.length; ++i) {
res[i] = new AppendableVec(_vg.vecKey(_vecIdStart + i), espc, 0);
res[i]._chunkTypes = MemoryManager.malloc1(nchunks);
}
for(int i = 0; i < _dout.length; ++i)
for(int j = 0; j < _dout[i]._vecs.length; ++j)
res[j].setSubRange(_dout[i]._vecs[j]);
return res;
}
public String _parserr; // NULL if parse is OK, else an error string
MultiFileParseTask(VectorGroup vg, int vecIdStart, CustomParser.ParserSetup setup, Key progress ) {
_vg = vg; _setup = setup; _progress = progress;
_vecIdStart = vecIdStart;
_runSingleThreaded = true;
}
@Override public void init(){
super.init();
_dout = new FVecDataOut[_keys.length];
}
@Override
public MultiFileParseTask dfork(Key... keys){
// init();
_fileChunkOffsets = new IcedHashMap<Key, IcedInt>();
int len = 0;
for( Key k:keys) {
_fileChunkOffsets.put(k,new IcedInt(len));
len += getVec(k).nChunks();
}
// Mapping from Chunk# to cluster-node-number
_chunk2Enum = MemoryManager.malloc4(len);
Arrays.fill(_chunk2Enum, -1);
return super.dfork(keys);
}
private FVecDataOut makeDout(ParserSetup localSetup, int chunkOff, int nchunks) {
AppendableVec [] avs = new AppendableVec[localSetup._ncols];
long [] espc = MemoryManager.malloc8(nchunks);
for(int i = 0; i < avs.length; ++i)
avs[i] = new AppendableVec(_vg.vecKey(i + _vecIdStart),espc, chunkOff);
return localSetup._pType == ParserType.SVMLight
?new SVMLightFVecDataOut(_vg,chunkOff,avs,_vecIdStart,chunkOff,enums(_eKey,localSetup._ncols))
:new FVecDataOut(_vg, chunkOff, chunkOff, enums(_eKey,localSetup._ncols), avs);
}
// Called once per file
@Override public void map( Key key ) {
// Get parser setup info for this chunk
ByteVec vec = (ByteVec) getVec(key);
byte [] bits = vec.chunkForChunkIdx(0)._mem;
if(bits == null || bits.length == 0){
assert false:"encountered empty file during multifile parse? should've been filtered already";
return; // should not really get here
}
final int chunkOff = _fileChunkOffsets.get(key)._val;
Compression cpr = Utils.guessCompressionMethod(bits);
CustomParser.ParserSetup localSetup = GuessSetup.guessSetup(Utils.unzipBytes(bits,cpr), _setup,false)._setup;
// Local setup: nearly the same as the global all-files setup, but maybe
// has the header-flag changed.
if(!localSetup.isCompatible(_setup)) {
_parserr = "Conflicting file layouts, expecting: " + _setup + " but found " + localSetup;
return;
}
// Allow duplicate headers, if they are equal ignoring case
boolean has_hdr = _setup._header && localSetup._header;
if( has_hdr ) { // Both have headers?
for( int i = 0; has_hdr && i < localSetup._columnNames.length; ++i )
has_hdr = localSetup._columnNames[i].equalsIgnoreCase(_setup._columnNames[i]);
if( !has_hdr ) // Headers not compatible?
// Then treat as no-headers, i.e., parse it as a normal row
localSetup = new CustomParser.ParserSetup(ParserType.CSV,localSetup._separator, false);
}
// Parse the file
try {
switch( cpr ) {
case NONE:
if( localSetup._pType.parallelParseSupported ) {
DParse dp = new DParse(_vg, localSetup, _vecIdStart, chunkOff,this, vec.nChunks());
addToPendingCount(1);
dp._removeKey = vec._key;
dp.exec(new Frame(vec));
for( int i = 0; i < vec.nChunks(); ++i )
_chunk2Enum[chunkOff + i] = vec.chunkKey(i).home_node().index();
} else {
ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
_dout[_lo] = streamParse(vec.openStream(pmon), localSetup, makeDout(localSetup,chunkOff,vec.nChunks()), pmon);
for(int i = 0; i < vec.nChunks(); ++i)
_chunk2Enum[chunkOff + i] = H2O.SELF.index();
}
break;
case ZIP: {
// Zipped file; no parallel decompression;
ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
ZipInputStream zis = new ZipInputStream(vec.openStream(pmon));
ZipEntry ze = zis.getNextEntry(); // Get the *FIRST* entry
// There is at least one entry in the zip file and it is not a directory.
if( ze != null && !ze.isDirectory() )
_dout[_lo] = streamParse(zis, localSetup, makeDout(localSetup,chunkOff,vec.nChunks()), pmon);
else zis.close(); // Confused: which zipped file to decompress
// set this node as the one which processed all the chunks
for(int i = 0; i < vec.nChunks(); ++i)
_chunk2Enum[chunkOff + i] = H2O.SELF.index();
break;
}
case GZIP:
// Zipped file; no parallel decompression;
ParseProgressMonitor pmon = new ParseProgressMonitor(_progress);
_dout[_lo] = streamParse(new GZIPInputStream(vec.openStream(pmon)),localSetup,makeDout(localSetup,chunkOff,vec.nChunks()), pmon);
// set this node as the one which processed all the chunks
for(int i = 0; i < vec.nChunks(); ++i)
_chunk2Enum[chunkOff + i] = H2O.SELF.index();
break;
}
} catch( IOException ioe ) {
throw new RuntimeException(ioe);
}
}
// Reduce: combine errors from across files.
// Roll-up other meta data
@Override public void reduce( MultiFileParseTask mfpt ) {
assert this != mfpt;
// Combine parse errors from across files
if( _parserr == null ) _parserr = mfpt._parserr;
else if( mfpt._parserr != null ) _parserr += mfpt._parserr;
// Collect & combine columns across files
if( _dout == null ) _dout = mfpt._dout;
else if(_dout != mfpt._dout) _dout = Utils.append(_dout,mfpt._dout);
if( _chunk2Enum == null ) _chunk2Enum = mfpt._chunk2Enum;
else if(_chunk2Enum != mfpt._chunk2Enum) { // we're sharing global array!
for( int i = 0; i < _chunk2Enum.length; ++i ) {
if( _chunk2Enum[i] == -1 ) _chunk2Enum[i] = mfpt._chunk2Enum[i];
else assert mfpt._chunk2Enum[i] == -1 : Arrays.toString(_chunk2Enum) + " :: " + Arrays.toString(mfpt._chunk2Enum);
}
}
}
private static Enum [] enums(Key eKey, int ncols){
if(!_enums.containsKey(eKey)){
Enum [] enums = new Enum[ncols];
for(int i = 0; i < enums.length; ++i)enums[i] = new Enum();
_enums.putIfAbsent(eKey, enums);
}
return _enums.get(eKey);
}
// ------------------------------------------------------------------------
// Zipped file; no parallel decompression; decompress into local chunks,
// parse local chunks; distribute chunks later.
private FVecDataOut streamParse( final InputStream is, final CustomParser.ParserSetup localSetup, FVecDataOut dout, ParseProgressMonitor pmon) throws IOException {
// All output into a fresh pile of NewChunks, one per column
CustomParser p = localSetup.parser();
// assume 2x inflation rate
if( localSetup._pType.parallelParseSupported )
try{p.streamParse(is, dout, pmon);}catch(IOException e){throw new RuntimeException(e);}
else
try{p.streamParse(is, dout);}catch(Exception e){throw new RuntimeException(e);}
// Parse all internal "chunks", until we drain the zip-stream dry. Not
// real chunks, just flipping between 32K buffers. Fills up the single
// very large NewChunk.
dout.close(_fs);
return dout;
}
// ------------------------------------------------------------------------
private static class DParse extends MRTask2<DParse> {
private final CustomParser.ParserSetup _setup;
private final int _vecIdStart;
private final int _chunkOff; // for multifile parse, offset of the first chunk in the final dataset
private final VectorGroup _vg;
private FVecDataOut _dout;
private final Key _eKey;
final Key _progress;
Key _removeKey;
private transient final MultiFileParseTask _outerMFPT;
final int _nchunks;
private transient NonBlockingSetInt _visited;
private transient long [] _espc;
DParse(VectorGroup vg, CustomParser.ParserSetup setup, int vecIdstart, int startChunkIdx, MultiFileParseTask mfpt, int nchunks) {
super(mfpt);
_vg = vg;
_setup = setup;
_vecIdStart = vecIdstart;
_chunkOff = startChunkIdx;
_outerMFPT = mfpt;
_eKey = mfpt._eKey;
_progress = mfpt._progress;
_nchunks = nchunks;
}
@Override public void setupLocal(){
super.setupLocal();
_espc = MemoryManager.malloc8(_nchunks);
_visited = new NonBlockingSetInt();
}
@Override public void map( Chunk in) {
AppendableVec [] avs = new AppendableVec[_setup._ncols];
for(int i = 0; i < avs.length; ++i)
avs[i] = new AppendableVec(_vg.vecKey(_vecIdStart + i), _espc, _chunkOff);
Enum [] enums = enums(_eKey,_setup._ncols);
// Break out the input & output vectors before the parse loop
// The Parser
FVecDataIn din = new FVecDataIn(in);
FVecDataOut dout;
CustomParser p;
switch(_setup._pType) {
case CSV:
p = new CsvParser(_setup);
dout = new FVecDataOut(_vg,_chunkOff, _chunkOff + in.cidx(),enums,avs);
break;
case SVMLight:
p = new SVMLightParser(_setup);
dout = new SVMLightFVecDataOut(_vg, _chunkOff + in.cidx(), avs, _vecIdStart, _chunkOff, enums);
break;
default:
throw H2O.unimpl();
}
p.parallelParse(in.cidx(),din,dout);
(_dout = dout).close(_fs);
onProgress(in._len, _progress); // Record bytes parsed
// free the parsed raw data right away (each raw chunk is read by two parse
// tasks: its own and the preceding chunk's, which reads past the boundary)
final int cidx = in.cidx();
if(!_visited.add(cidx)) {
Value v = H2O.get(in._vec.chunkKey(cidx));
if(v != null && v.isPersisted()) {
v.freePOJO();
v.freeMem();
}
}
if(!_visited.add(cidx+1)) {
Value v = H2O.get(in._vec.chunkKey(cidx+1));
if(v != null && v.isPersisted()) {
v.freePOJO();
v.freeMem();
}
}
}
@Override public void reduce(DParse dp) {
if(_dout == null)_dout = dp._dout;
else _dout.reduce(dp._dout);
}
@Override public void postGlobal() {
super.postGlobal();
_outerMFPT._dout[_outerMFPT._lo] = _dout;
_dout = null; // Reclaim GC eagerly
if(_removeKey != null) UKV.remove(_removeKey);
}
}
// Find & remove all partially built output chunks & vecs
private Futures onExceptionCleanup(Futures fs) {
int nchunks = _chunk2Enum.length;
int ncols = _setup._ncols;
for( int i = 0; i < ncols; ++i ) {
Key vkey = _vg.vecKey(_vecIdStart + i);
DKV.remove(vkey,fs);
for( int c = 0; c < nchunks; ++c )
DKV.remove(Vec.chunkKey(vkey,c),fs);
}
cancel(true);
return fs;
}
}
// ------------------------------------------------------------------------
/** Parsed data output specialized for fluid vecs.
* @author tomasnykodym
*/
static class FVecDataOut extends Iced implements CustomParser.StreamDataOut {
protected transient NewChunk [] _nvs;
public final int _chunkOff;
protected AppendableVec []_vecs;
private transient final Enum [] _enums;
protected byte [] _ctypes;
long _nLines;
int _col = -1;
final int _cidx;
int _nChunks;
boolean _closedVecs = false;
private final VectorGroup _vg;
static final protected byte UCOL = 0; // unknown col type
static final protected byte NCOL = 1; // numeric col type
static final protected byte ECOL = 2; // enum col type
static final protected byte TCOL = 3; // time col type
static final protected byte ICOL = 4; // UUID col type
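// Columns start as UCOL and are promoted by the first recognized token:
// numeric tokens promote to NCOL, time-parsable strings to TCOL, UUID-parsable
// strings to ICOL, and remaining strings to ECOL (enum); a column already
// marked NCOL treats later strings as NAs (see addStrCol below).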
public int nChunks(){return _nChunks;}
// private static AppendableVec[] newAppendables(int n, VectorGroup vg, int vecIdStart){
// AppendableVec [] apps = new AppendableVec[n];
//
// for(int i = 0; i < n; ++i)
// apps[i] = new AppendableVec(vg.vecKey(vecIdStart + i));
// return apps;
// }
// public FVecDataOut(VectorGroup vg, int cidx, int ncols, int vecIdStart, Enum[] enums){
// this(vg,cidx,ncols,vecIdStart,enums,newAppendables(ncols,vg,vecIdStart));
// }
public FVecDataOut(VectorGroup vg, int chunkOff, int cidx, Enum [] enums, AppendableVec [] appendables){
assert cidx - chunkOff >= 0:"incompatible cidx/chunkOff " + chunkOff + ", " + cidx;
_vecs = appendables;
_chunkOff = chunkOff;
_enums = enums;
_cidx = cidx;
_vg = vg;
_ctypes = MemoryManager.malloc1(appendables.length);
_nvs = new NewChunk[appendables.length];
for(int i = 0; i < appendables.length; ++i)
_nvs[i] = (NewChunk)_vecs[i].chunkForChunkIdx(_cidx);
}
@Override public FVecDataOut reduce(StreamDataOut sdout){
FVecDataOut dout = (FVecDataOut)sdout;
if( dout == null ) return this; // nothing to fold in
_nChunks += dout._nChunks;
if( _vecs != dout._vecs ){
if(dout._vecs.length > _vecs.length){
AppendableVec [] v = _vecs;
_vecs = dout._vecs;
for(int i = 1; i < _vecs.length; ++i)
_vecs[i]._espc = _vecs[0]._espc;
dout._vecs = v;
}
for(int i = 0; i < dout._vecs.length; ++i)
_vecs[i].reduce(dout._vecs[i]);
}
return this;
}
@Override public FVecDataOut close(){
Futures fs = new Futures();
close(fs);
fs.blockForPending();
return this;
}
public void check(){
if(_nvs != null) for(NewChunk nv:_nvs)
assert (nv._len == _nLines):"unexpected number of lines in NewChunk, got " + nv._len + ", but expected " + _nLines;
}
@Override public FVecDataOut close(final Futures fs){
if( _nvs == null ) return this; // Might call close twice
for(NewChunk nc:_nvs)
assert nc._len == _nLines:"incompatible lengths after parsing chunk, " + _nLines + " != " + nc._len;
RecursiveAction [] rs = new RecursiveAction[_nvs.length];
for(int i = 0; i < _nvs.length; ++i) {
final int fi = i;
rs[i] = new RecursiveAction() {
@Override
protected void compute() {
_nvs[fi].close(_cidx, fs);
_nvs[fi] = null; // Free for GC
}
};
}
ForkJoinTask.invokeAll(rs);
_nChunks++;
_nvs = null; // Free for GC
return this;
}
@Override public FVecDataOut nextChunk(){
return new FVecDataOut(_vg, _chunkOff,_cidx+1,_enums, _vecs);
}
// protected Vec [] closeVecs(){
// Futures fs = new Futures();
// _closedVecs = true;
// Vec [] res = new Vec[_vecs.length];
// for(int i = 0; i < _vecs.length; ++i)
// res[i] = _vecs[i].close(fs);
// _vecs = null; // Free for GC
// fs.blockForPending();
// return res;
// }
@Override public void newLine() {
if(_col >= 0){
++_nLines;
for(int i = _col+1; i < _vecs.length; ++i)
addInvalidCol(i);
}
_col = -1;
}
@Override public void addNumCol(int colIdx, long number, int exp) {
if( colIdx < _vecs.length ) {
_nvs[_col = colIdx].addNum(number, exp);
if(_ctypes[colIdx] == UCOL ) _ctypes[colIdx] = NCOL;
}
}
@Override public void addInvalidCol(int colIdx) {
if(colIdx < _vecs.length) _nvs[_col = colIdx].addNA();
}
@Override public final boolean isString(int colIdx) { return false; }
@Override public final void addStrCol(int colIdx, ValueString str) {
if(colIdx < _nvs.length){
if(_ctypes[colIdx] == NCOL){ // support enforced types
addInvalidCol(colIdx);
return;
}
if(_ctypes[colIdx] == UCOL && ParseTime.attemptTimeParse(str) > 0)
_ctypes[colIdx] = TCOL;
if( _ctypes[colIdx] == UCOL ) { // Attempt UUID parse
int old = str.get_off();
ParseTime.attemptUUIDParse0(str);
ParseTime.attemptUUIDParse1(str);
if( str.get_off() != -1 ) _ctypes[colIdx] = ICOL;
str.setOff(old);
}
if( _ctypes[colIdx] == TCOL ) {
long l = ParseTime.attemptTimeParse(str);
if( l == Long.MIN_VALUE ) addInvalidCol(colIdx);
else {
int time_pat = ParseTime.decodePat(l); // Get time pattern
l = ParseTime.decodeTime(l); // Get time
addNumCol(colIdx, l, 0); // Record time in msec
_nvs[_col]._timCnt[time_pat]++; // Count histo of time parse patterns
}
} else if( _ctypes[colIdx] == ICOL ) { // UUID column? Only allow UUID parses
long lo = ParseTime.attemptUUIDParse0(str);
long hi = ParseTime.attemptUUIDParse1(str);
if( str.get_off() == -1 ) { lo = C16Chunk._LO_NA; hi = C16Chunk._HI_NA; }
if( colIdx < _vecs.length ) _nvs[_col = colIdx].addUUID(lo, hi);
} else if(!_enums[_col = colIdx].isKilled()) {
// store enum id into exponent, so that it will be interpreted as NA if compressing as numcol.
int id = _enums[colIdx].addKey(str);
if(_ctypes[colIdx] == UCOL && id > 1) _ctypes[colIdx] = ECOL;
_nvs[colIdx].addEnum(id);
} else // turn the column into NAs by adding value overflowing Enum.MAX_SIZE
_nvs[colIdx].addEnum(Integer.MAX_VALUE);
} //else System.err.println("additional column (" + colIdx + ":" + str + ") on line " + linenum());
}
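// Numeric values arrive either pre-split as (mantissa, decimal exponent) via
// addNumCol(int,long,int) above, or as raw doubles via the overload below,
// which decomposes them itself; e.g. 2.5 is stored as number=25, exp=-1
// (an exactly representable example), while NaN is recorded as an NA.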
/** Adds double value to the column. */
@Override public void addNumCol(int colIdx, double value) {
if (Double.isNaN(value)) {
addInvalidCol(colIdx);
} else {
double d= value;
int exp = 0;
long number = (long)d;
while (number != d) {
d = d * 10;
--exp;
number = (long)d;
}
addNumCol(colIdx, number, exp);
}
}
@Override public void setColumnNames(String [] names){}
@Override public final void rollbackLine() {}
@Override public void invalidLine(String err) { newLine(); }
@Override public void invalidValue(int line, int col) {}
}
// ------------------------------------------------------------------------
/** Parser data-in implementation that feeds data from a fluid vec chunk.
* @author tomasnykodym
*/
private static class FVecDataIn implements CustomParser.DataIn {
final Vec _vec;
Chunk _chk;
int _idx;
final long _firstLine;
public FVecDataIn(Chunk chk){
_chk = chk;
_idx = _chk.cidx();
_firstLine = _chk._start;
_vec = chk._vec;
}
@Override public byte[] getChunkData(int cidx) {
if(cidx != _idx)
_chk = cidx < _vec.nChunks()?_vec.chunkForChunkIdx(_idx = cidx):null;
return (_chk == null)?null:_chk._mem;
}
@Override public int getChunkDataStart(int cidx) { return -1; }
@Override public void setChunkDataStart(int cidx, int offset) { }
}
}