package water.persist;

import java.io.*;
import java.net.SocketTimeoutException;
import java.net.URI;
import java.util.ArrayList;
import java.util.concurrent.Callable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.fs.FileSystem;

import water.*;
import water.Job.ProgressMonitor;
import water.api.Constants;
import water.api.Constants.Extensions;
import water.fvec.*;
import water.util.*;
import water.util.Log.Tag.Sys;

import com.google.common.base.Strings;
import com.google.common.io.ByteStreams;

import dontweave.gson.*;

public final class PersistHdfs extends Persist {
  public static final Configuration CONF;
  private final Path _iceRoot;

  // Returns the String path for the given key, stripping the DVEC prefix if present.
  private static String getPathForKey(Key k) {
    final int off = k._kb[0] == Key.DVEC ? Vec.KEY_PREFIX_LEN : 0;
    return new String(k._kb, off, k._kb.length - off);
  }

  static {
    Log.POST(4001, "");
    Configuration conf = null;
    Log.POST(4002, "");
    if( H2O.OPT_ARGS.hdfs_config != null ) {
      Log.POST(4003, "");
      conf = new Configuration();
      File p = new File(H2O.OPT_ARGS.hdfs_config);
      if( !p.exists() ) Log.die("Unable to open hdfs configuration file " + p.getAbsolutePath());
      conf.addResource(new Path(p.getAbsolutePath()));
      Log.info(Sys.HDFS_, "resource ", p.getAbsolutePath(), " added to the hadoop configuration");
      Log.POST(4004, "");
    } else {
      Log.POST(4005, "");
      conf = new Configuration();
      Log.POST(4006, "");
      if( !Strings.isNullOrEmpty(H2O.OPT_ARGS.hdfs) ) {
        // Set up the default remote filesystem - for version 0.21 and higher
        Log.POST(4007, "");
        conf.set("fs.defaultFS", H2O.OPT_ARGS.hdfs);
        // To remain compatible with version 0.20.0 it is also necessary to set the property
        // 'fs.default.name', which newer versions renamed to 'fs.defaultFS'.
        Log.POST(4008, "");
        conf.set("fs.default.name", H2O.OPT_ARGS.hdfs);
      }
    }
    Log.POST(4009, "");
    CONF = conf;
    Log.POST(4010, "");
  }

  // Loading HDFS files
  PersistHdfs() {
    Log.POST(4000, "");
    _iceRoot = null;
  }

  // Loading/Writing ice to HDFS
  PersistHdfs(URI uri) {
    try {
      _iceRoot = new Path(uri + "/ice" + H2O.SELF_ADDRESS.getHostAddress() + "-" + H2O.API_PORT);
      // Make the directory as-needed
      FileSystem fs = FileSystem.get(_iceRoot.toUri(), CONF);
      fs.mkdirs(_iceRoot);
    } catch( Exception e ) {
      throw Log.errRTExcept(e);
    }
  }

  @Override public String getPath() {
    return _iceRoot != null ? _iceRoot.toString() : null;
  }

  @Override public void loadExisting() {
    // TODO?
    throw new UnsupportedOperationException();
  }

  @Override public void clear() {
    assert this == getIce();
    run(new Callable() {
      @Override public Object call() throws Exception {
        FileSystem fs = FileSystem.get(_iceRoot.toUri(), CONF);
        fs.delete(_iceRoot, true);
        return null;
      }
    }, false, 0);
  }

  private static class H2OHdfsInputStream extends RIStream {
    final FileSystem _fs;
    final Path _path;

    public H2OHdfsInputStream(Path p, long offset, ProgressMonitor pmon) throws IOException {
      super(offset, pmon);
      _path = p;
      _fs = FileSystem.get(p.toUri(), CONF);
      setExpectedSz(_fs.getFileStatus(p).getLen());
      open();
    }

    @Override protected InputStream open(long offset) throws IOException {
      FSDataInputStream is = _fs.open(_path);
      is.seek(offset);
      return is;
    }
  }

  public static InputStream openStream(Key k, ProgressMonitor pmon) throws IOException {
    H2OHdfsInputStream res = null;
    Path p = new Path(k.toString());
    try {
      res = new H2OHdfsInputStream(p, 0, pmon);
    } catch( IOException e ) {
      Log.warn("Error while opening HDFS key " + k.toString() + ", will wait and retry.");
      try { Thread.sleep(1000); } catch( Exception ex ) {}
      res = new H2OHdfsInputStream(p, 0, pmon);
    }
    return res;
  }

  @Override public byte[] load(final Value v) {
    final byte[] b = MemoryManager.malloc1(v._max);
    long skip = 0;
    Key k = v._key;
    if( k._kb[0] == Key.DVEC ) skip = FileVec.chunkOffset(k); // Offset of this chunk within the backing file
    final Path p = _iceRoot == null ? new Path(getPathForKey(k)) : new Path(_iceRoot, getIceName(v));
    final long skip_ = skip;
    run(new Callable() {
      @Override public Object call() throws Exception {
        FileSystem fs = FileSystem.get(p.toUri(), CONF);
        FSDataInputStream s = null;
        try {
          s = fs.open(p);
          // NOTE:
          // The following line degrades performance of HDFS load from S3 API: s.readFully(skip,b,0,b.length);
          // Google API's simple seek has better performance
          // Load of 300MB file via Google API ~ 14sec, via s.readFully ~ 5min (under the same conditions)
          ByteStreams.skipFully(s, skip_);
          ByteStreams.readFully(s, b);
          assert v.isPersisted();
        } finally {
          Utils.close(s);
        }
        return null;
      }
    }, true, v._max);
    return b;
  }

  @Override public void store(Value v) {
    // Should be used only if ice goes to HDFS
    assert this == getIce();
    assert !v.isPersisted();
    byte[] m = v.memOrLoad();
    assert (m == null || m.length == v._max); // Assert not saving partial files
    store(new Path(_iceRoot, getIceName(v)), m);
    v.setdsk(); // Set as write-complete to disk
  }

  public static void store(final Path path, final byte[] data) {
    run(new Callable() {
      @Override public Object call() throws Exception {
        FileSystem fs = FileSystem.get(path.toUri(), CONF);
        fs.mkdirs(path.getParent());
        FSDataOutputStream s = fs.create(path);
        try {
          s.write(data);
        } finally {
          s.close();
        }
        return null;
      }
    }, false, data.length);
  }

  @Override public void delete(final Value v) {
    assert this == getIce();
    assert !v.isPersisted(); // Upper layers already cleared out
    run(new Callable() {
      @Override public Object call() throws Exception {
        Path p = new Path(_iceRoot, getIceName(v));
        FileSystem fs = FileSystem.get(p.toUri(), CONF);
        fs.delete(p, true);
        return null;
      }
    }, false, 0);
  }

  private static class Size { int _value; }

  private static void run(Callable c, boolean read, int size) {
    // Count all i/o time from here, including all retry overheads
    long start_io_ms = System.currentTimeMillis();
    while( true ) {
      try {
        long start_ns = System.nanoTime(); // Blocking i/o call timing - without counting repeats
        c.call();
        TimeLine.record_IOclose(start_ns, start_io_ms, read ? 1 : 0, size, Value.HDFS);
        break;
        // Explicitly ignore the following exceptions but
        // fail on the rest of the IOExceptions
      } catch( EOFException e ) {
        ignoreAndWait(e, false);
      } catch( SocketTimeoutException e ) {
        ignoreAndWait(e, false);
      } catch( IOException e ) {
        // Newer versions of Hadoop derive S3Exception from IOException
        if( e.getClass().getName().contains("S3Exception") ) {
          ignoreAndWait(e, false);
        } else {
          ignoreAndWait(e, true);
        }
      } catch( RuntimeException e ) {
        // Older versions of Hadoop derive S3Exception from RuntimeException
        if( e.getClass().getName().contains("S3Exception") ) {
          ignoreAndWait(e, false);
        } else {
          throw Log.errRTExcept(e);
        }
      } catch( Exception e ) {
        throw Log.errRTExcept(e);
      }
    }
  }

  private static void ignoreAndWait(final Exception e, boolean printException) {
    H2O.ignore(e, "Hit HDFS reset problem, retrying...", printException);
    try { Thread.sleep(500); } catch( InterruptedException ie ) {}
  }

  /*
   * Load all files in a folder.
   */
  public static void addFolder(Path p, JsonArray succeeded, JsonArray failed) throws IOException {
    FileSystem fs = FileSystem.get(p.toUri(), PersistHdfs.CONF);
    if( !fs.exists(p) ) {
      JsonObject o = new JsonObject();
      o.addProperty(Constants.FILE, p.toString());
      o.addProperty(Constants.ERROR, "Path does not exist!");
      failed.add(o);
      return;
    }
    addFolder(fs, p, succeeded, failed);
  }

  public static void addFolder2(Path p, ArrayList<String> keys, ArrayList<String> failed) throws IOException {
    FileSystem fs = FileSystem.get(p.toUri(), PersistHdfs.CONF);
    if( !fs.exists(p) ) {
      failed.add("Path does not exist: '" + p.toString() + "'");
      return;
    }
    addFolder2(fs, p, keys, failed);
  }

  private static void addFolder2(FileSystem fs, Path p, ArrayList<String> keys, ArrayList<String> failed) {
    try {
      if( fs == null ) return;
      Futures futures = new Futures();
      for( FileStatus file : fs.listStatus(p) ) {
        Path pfs = file.getPath();
        if( file.isDir() ) {
          addFolder2(fs, pfs, keys, failed);
        } else {
          if( pfs.getName().endsWith(Extensions.JSON) ) {
            throw H2O.unimpl();
          } else if( pfs.getName().endsWith(Extensions.HEX) ) { // Hex file?
            throw H2O.unimpl();
          } else {
            Key k = HdfsFileVec.make(file, futures);
            keys.add(k.toString());
            Log.info("PersistHdfs: DKV.put(" + k + ")");
          }
        }
      }
    } catch( Exception e ) {
      Log.err(e);
      failed.add(p.toString());
    }
  }

  private static void addFolder(FileSystem fs, Path p, JsonArray succeeded, JsonArray failed) {
    try {
      if( fs == null ) return;
      for( FileStatus file : fs.listStatus(p) ) {
        Path pfs = file.getPath();
        if( file.isDir() ) {
          addFolder(fs, pfs, succeeded, failed);
        } else {
          Key k = Key.make(pfs.toString());
          long size = file.getLen();
          Value val = new Value(k, (int) size, Value.HDFS); // Plain Value
          val.setdsk();
          DKV.put(k, val);
          Log.info("PersistHdfs: DKV.put(" + k + ")");
          JsonObject o = new JsonObject();
          o.addProperty(Constants.KEY, k.toString());
          o.addProperty(Constants.FILE, pfs.toString());
          o.addProperty(Constants.VALUE_SIZE, file.getLen());
          succeeded.add(o);
        }
      }
    } catch( Exception e ) {
      Log.err(e);
      JsonObject o = new JsonObject();
      o.addProperty(Constants.FILE, p.toString());
      o.addProperty(Constants.ERROR, e.getMessage());
      failed.add(o);
    }
  }
}
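// Usage sketch (illustrative only): how a caller might import every file under an
// HDFS folder via addFolder2 and then stream the first resulting key with openStream.
// The namenode URI and dataset path below are assumed placeholders, and passing a
// null ProgressMonitor is assumed to be tolerated by the underlying RIStream.
//
//   ArrayList<String> keys   = new ArrayList<String>();
//   ArrayList<String> failed = new ArrayList<String>();
//   PersistHdfs.addFolder2(new Path("hdfs://namenode:8020/datasets"), keys, failed);
//   if( failed.isEmpty() && !keys.isEmpty() ) {
//     InputStream is = PersistHdfs.openStream(Key.make(keys.get(0)), null);
//     try { /* consume the stream */ } finally { Utils.close(is); }
//   }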