package water.persist;

import water.H2O;
import water.Key;
import water.MRTask;
import water.Value;
import water.exceptions.H2OIllegalArgumentException;
import water.fvec.UploadFileVec;
import water.util.FileUtils;
import water.util.Log;
import water.persist.Persist.PersistEntry;

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import static water.H2O.OptArgs.SYSTEM_PROP_PREFIX;

/**
 * One true persistence manager which hides the implementations from H2O.
 * In particular, HDFS support or S3 support may or may not exist depending
 * on what is on the classpath.
 */
public class PersistManager {

  final static public int MAX_BACKENDS = 8;

  /** Property which enables HDFS as the default fallback persistence layer. For example,
   *  if the Swift FS is registered properly under HDFS and the user specifies a Swift-based URI,
   *  the persist layer forwards the request through the HDFS API. */
  final static String PROP_ENABLE_HDFS_FALLBACK = SYSTEM_PROP_PREFIX + "persist.enable.hdfs.fallback";

  /** Persistence schemes; used as file prefixes, e.g. "hdfs://some_hdfs_path/some_file" */
  public static class Schemes {
    public static final String FILE = "file";
    public static final String HDFS = "hdfs";
    public static final String S3 = "s3";
    public static final String S3N = "s3n";
    public static final String S3A = "s3a";
    public static final String NFS = "nfs";
  }

  public static class PersistStatsEntry {
    public PersistStatsEntry() {
      store_count = new AtomicLong();
      store_bytes = new AtomicLong();
      delete_count = new AtomicLong();
      load_count = new AtomicLong();
      load_bytes = new AtomicLong();
    }

    public AtomicLong store_count;
    public AtomicLong store_bytes;
    public AtomicLong delete_count;
    public AtomicLong load_count;
    public AtomicLong load_bytes;
  }

  private Persist[] I;
  private PersistStatsEntry[] stats;
  public PersistStatsEntry[] getStats() { return stats; }

  public boolean isHdfsPath(String path) {
    String s = path.toLowerCase();
    if (s.startsWith("hdfs:")
        || s.startsWith("s3:")
        || s.startsWith("s3n:")
        || s.startsWith("s3a:")
        || s.startsWith("maprfs:")
        || useHdfsAsFallback() && I[Value.HDFS] != null && I[Value.HDFS].canHandle(path)) {
      return true;
    }
    return false;
  }

  private void validateHdfsConfigured() {
    if (I[Value.HDFS] == null) {
      throw new H2OIllegalArgumentException("HDFS, S3, S3N, and S3A support is not configured");
    }
  }

  public PersistManager(URI iceRoot) {
    I = new Persist[MAX_BACKENDS];
    stats = new PersistStatsEntry[MAX_BACKENDS];
    for (int i = 0; i < stats.length; i++) {
      stats[i] = new PersistStatsEntry();
    }

    if (iceRoot == null) {
      Log.err("ice_root must be specified. Exiting.");
      H2O.exit(1);
    }

    Persist ice = null;
    boolean windowsPath = iceRoot.toString().matches("^[a-zA-Z]:.*");

    if (windowsPath) {
      ice = new PersistFS(new File(iceRoot.toString()));
    }
    else if ((iceRoot.getScheme() == null) || Schemes.FILE.equals(iceRoot.getScheme())) {
      ice = new PersistFS(new File(iceRoot.getPath()));
    }
    else if (Schemes.HDFS.equals(iceRoot.getScheme())) {
      Log.err("HDFS ice_root not yet supported. Exiting.");
      H2O.exit(1);

      // I am not sure anyone actually ever does this.
      // H2O on Hadoop launches use local disk for ice root.
      // This has a chance to work, but turn it off until it gets tested.
      //
      // try {
      //   Class klass = Class.forName("water.persist.PersistHdfs");
      //   java.lang.reflect.Constructor constructor = klass.getConstructor(new Class[]{URI.class});
      //   ice = (Persist) constructor.newInstance(iceRoot);
      // } catch (Exception e) {
      //   Log.err("Could not initialize HDFS");
      //   throw new RuntimeException(e);
      // }
    }

    I[Value.ICE] = ice;
    I[Value.NFS] = new PersistNFS();

    // The HDFS and S3 backends are optional: they are loaded reflectively so that the core
    // still works when the corresponding jars are not on the classpath.
    try {
      Class klass = Class.forName("water.persist.PersistHdfs");
      java.lang.reflect.Constructor constructor = klass.getConstructor();
      I[Value.HDFS] = (Persist) constructor.newInstance();
      Log.info("HDFS subsystem successfully initialized");
    }
    catch (Throwable ignore) {
      Log.info("HDFS subsystem not available");
    }

    try {
      Class klass = Class.forName("water.persist.PersistS3");
      java.lang.reflect.Constructor constructor = klass.getConstructor();
      I[Value.S3] = (Persist) constructor.newInstance();
      Log.info("S3 subsystem successfully initialized");
    }
    catch (Throwable ignore) {
      Log.info("S3 subsystem not available");
    }
  }
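  // Illustrative sketch (not part of the original source): the HDFS and S3 backends initialized
  // above are optional, and URIs whose scheme no registered backend claims can still be routed
  // through the HDFS API when the fallback property is enabled (the default). Here "pm" stands
  // for an instance of this class, the Swift URI is made up, and the constant is package-private,
  // so outside water.persist the property would have to be set by its literal name.
  //
  //   System.setProperty(PersistManager.PROP_ENABLE_HDFS_FALLBACK, "true");
  //   boolean viaHdfs = pm.isHdfsPath("swift://container/object.csv");
  //   // true only if the HDFS subsystem is on the classpath and its API can handle the URI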
  public void store(int backend, Value v) throws IOException {
    stats[backend].store_count.incrementAndGet();
    I[backend].store(v);
  }

  public void delete(int backend, Value v) {
    stats[backend].delete_count.incrementAndGet();
    I[backend].delete(v);
  }

  public byte[] load(int backend, Value v) throws IOException {
    stats[backend].load_count.incrementAndGet();
    byte[] arr = I[backend].load(v);
    stats[backend].load_bytes.addAndGet(arr.length);
    return arr;
  }

  /** Get the current Persist flavor for user-mode swapping. */
  public Persist getIce() { return I[Value.ICE]; }

  /** Convert a given URI into a specific H2O key representation.
   *
   * The representation depends on the persistence backend, since the backend
   * deduces the file location from the key content.
   *
   * The method looks at the scheme of the URI and, based on it, asks the matching
   * backend to provide the conversion to a key (i.e., a URI with scheme 'hdfs'
   * is forwarded to the HDFS backend).
   *
   * @param uri file location
   * @return a key encoding the URI
   * @throws IOException in case of a URI conversion problem
   * @throws water.exceptions.H2OIllegalArgumentException in case of an unsupported scheme
   */
  public final Key anyURIToKey(URI uri) throws IOException {
    Key ikey = null;
    String scheme = uri.getScheme();
    if ("s3".equals(scheme)) {
      ikey = I[Value.S3].uriToKey(uri);
    } else if ("hdfs".equals(scheme)) {
      ikey = I[Value.HDFS].uriToKey(uri);
    } else if ("s3n".equals(scheme) || "s3a".equals(scheme)) {
      ikey = I[Value.HDFS].uriToKey(uri);
    } else if ("file".equals(scheme) || scheme == null) {
      ikey = I[Value.NFS].uriToKey(uri);
    } else if (useHdfsAsFallback() && I[Value.HDFS].canHandle(uri.toString())) {
      ikey = I[Value.HDFS].uriToKey(uri);
    } else {
      throw new H2OIllegalArgumentException("Unsupported scheme '" + scheme + "' for given uri " + uri);
    }
    return ikey;
  }
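  // Illustrative sketch (not part of the original source): converting URIs to keys. Assumes a
  // running H2O node so that H2O.getPM() returns the active manager; the paths are made up and
  // the IOException declared by anyURIToKey is left unhandled for brevity.
  //
  //   PersistManager pm = H2O.getPM();
  //   Key local  = pm.anyURIToKey(URI.create("file:///tmp/iris.csv"));      // NFS backend
  //   Key remote = pm.anyURIToKey(URI.create("hdfs://nn:8020/data/iris"));  // needs the HDFS subsystem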
  private static boolean httpUrlExists(String urlName) {
    try {
      HttpURLConnection con = (HttpURLConnection) new URL(urlName).openConnection();
      con.setInstanceFollowRedirects(false);
      con.setRequestMethod("HEAD");
      return (con.getResponseCode() == HttpURLConnection.HTTP_OK);
    } catch (Exception e) {
      return false;
    }
  }

  /**
   * Calculate typeahead matches for the given filter string.
   *
   * @param filter Source string to match for typeahead
   * @param limit Max number of entries to return
   * @return List of matches
   */
  public List<String> calcTypeaheadMatches(String filter, int limit) {
    String s = filter.toLowerCase();
    if (s.startsWith("http:") || s.startsWith("https:")) {
      if (httpUrlExists(filter)) {
        ArrayList<String> arrayList = new ArrayList<>();
        arrayList.add(filter);
        return arrayList;
      }
      else {
        return new ArrayList<>();
      }
    } else if (s.startsWith("s3://")) {
      return I[Value.S3].calcTypeaheadMatches(filter, limit);
    } else if (s.startsWith("hdfs:")
               || s.startsWith("s3n:")
               || s.startsWith("s3a:")
               || s.startsWith("maprfs:")
               || useHdfsAsFallback() && I[Value.HDFS] != null && I[Value.HDFS].canHandle(s)) {
      if (I[Value.HDFS] == null) {
        throw new H2OIllegalArgumentException("HDFS, S3, S3N, and S3A support is not configured");
      }
      return I[Value.HDFS].calcTypeaheadMatches(filter, limit);
    }

    return I[Value.NFS].calcTypeaheadMatches(filter, limit);
  }
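  // Illustrative sketch (not part of the original source): typeahead matching, e.g. for a
  // file-picker UI. The prefix of the filter decides which backend answers; "pm" stands for an
  // instance of this class and the paths are made up.
  //
  //   List<String> local = pm.calcTypeaheadMatches("/home/user/data/", 100);           // local filesystem
  //   List<String> s3    = pm.calcTypeaheadMatches("s3://bucket/prefix", 100);          // S3 backend, if configured
  //   List<String> web   = pm.calcTypeaheadMatches("https://example.com/x.csv", 100);   // non-empty iff HEAD returns 200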
  /**
   * From a path produce a list of files and keys for parsing.
   *
   * Use as follows:
   * <pre>{@code
   *   ArrayList<String> files = new ArrayList<>();
   *   ArrayList<String> keys  = new ArrayList<>();
   *   ArrayList<String> fails = new ArrayList<>();
   *   ArrayList<String> dels  = new ArrayList<>();
   *   importFiles(importFiles.path, null, files, keys, fails, dels);
   * }</pre>
   *
   * @param path    (Input) Path to import data from
   * @param pattern (Input) Regex pattern to match files by
   * @param files   (Output) List of files found
   * @param keys    (Output) List of keys corresponding to files
   * @param fails   (Output) List of failed files which mismatch among nodes
   * @param dels    (Output) Passed through to the backend implementations (purpose left undocumented here)
   */
  public void importFiles(String path, String pattern, ArrayList<String> files, ArrayList<String> keys, ArrayList<String> fails, ArrayList<String> dels) {
    URI uri = FileUtils.getURI(path);
    String scheme = uri.getScheme();
    if (scheme == null || "file".equals(scheme)) {
      I[Value.NFS].importFiles(path, pattern, files, keys, fails, dels);
    } else if ("http".equals(scheme) || "https".equals(scheme)) {
      try {
        java.net.URL url = new URL(path);
        Key destination_key = Key.make(path);
        java.io.InputStream is = url.openStream();
        UploadFileVec.ReadPutStats stats = new UploadFileVec.ReadPutStats();
        UploadFileVec.readPut(destination_key, is, stats);
        files.add(path);
        keys.add(destination_key.toString());
      } catch (Throwable e) {
        // Failures (e.g. broken sockets) are swallowed silently; just record the failed path.
        fails.add(path);
      }
    } else if ("s3".equals(scheme)) {
      if (I[Value.S3] == null) throw new H2OIllegalArgumentException("S3 support is not configured");
      I[Value.S3].importFiles(path, pattern, files, keys, fails, dels);
    } else if ("hdfs".equals(scheme) || "s3n".equals(scheme) || "s3a".equals(scheme) || "maprfs".equals(scheme)
               || (useHdfsAsFallback() && I[Value.HDFS] != null && I[Value.HDFS].canHandle(path))) {
      if (I[Value.HDFS] == null) throw new H2OIllegalArgumentException("HDFS, S3N, and S3A support is not configured");
      I[Value.HDFS].importFiles(path, pattern, files, keys, fails, dels);
    }

    if (pattern != null && !pattern.isEmpty()) {
      files.retainAll(matchPattern(path, files, pattern)); // New files ArrayList after matching pattern of choice
      keys.retainAll(matchPattern(path, keys, pattern));   // New keys ArrayList after matching pattern of choice
      // Only show failures that match the pattern
      if (!fails.isEmpty()) {
        fails.retainAll(matchPattern(path, fails, pattern));
      }
    }
  }
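  // Illustrative sketch (not part of the original source), expanding the javadoc usage above with
  // a regex pattern; "pm" stands for an instance of this class, the directory and pattern are made up.
  //
  //   ArrayList<String> files = new ArrayList<>();
  //   ArrayList<String> keys  = new ArrayList<>();
  //   ArrayList<String> fails = new ArrayList<>();
  //   ArrayList<String> dels  = new ArrayList<>();
  //   pm.importFiles("/data/logs", ".*\\.csv$", files, keys, fails, dels);
  //   // files and keys now only hold entries whose path after "/data/logs" matches the pattern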
  // -------------------------------
  // Node Persistent Storage helpers
  // -------------------------------

  // Reads

  public String getHdfsHomeDirectory() {
    if (I[Value.HDFS] == null) {
      return null;
    }
    return I[Value.HDFS].getHomeDirectory();
  }

  public PersistEntry[] list(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      PersistEntry[] arr = I[Value.HDFS].list(path);
      return arr;
    }

    File dir = new File(path);
    File[] files = dir.listFiles();
    if (files == null) {
      return new PersistEntry[0];
    }

    ArrayList<PersistEntry> arr = new ArrayList<>();
    for (File f : files) {
      PersistEntry entry = new PersistEntry(f.getName(), f.length(), f.lastModified());
      arr.add(entry);
    }
    return arr.toArray(new PersistEntry[arr.size()]);
  }

  public boolean exists(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      boolean b = I[Value.HDFS].exists(path);
      return b;
    }

    File f = new File(path);
    return f.exists();
  }

  public boolean isDirectory(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      boolean b = I[Value.HDFS].isDirectory(path);
      return b;
    }

    File f = new File(path);
    return f.isDirectory();
  }

  /**
   * Checks whether a given path is either an empty directory or does not yet exist.
   * This is trivial if the path lives on a distributed filesystem.
   * If we are working with a local filesystem we need to make sure that this property
   * is satisfied on all of the nodes.
   * @param path path we want to check
   * @return true if the path is an empty or non-existent directory everywhere, false otherwise
   */
  public boolean isEmptyDirectoryAllNodes(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      if (! I[Value.HDFS].exists(path)) return true;
      if (! I[Value.HDFS].isDirectory(path)) return false;
      PersistEntry[] content = I[Value.HDFS].list(path);
      return (content == null) || (content.length == 0);
    }

    return new CheckLocalDirTask(path).doAllNodes()._result;
  }

  private static class CheckLocalDirTask extends MRTask<CheckLocalDirTask> {
    String _path;
    // OUT
    boolean _result;

    CheckLocalDirTask(String _path) { this._path = _path; }

    @Override
    public void reduce(CheckLocalDirTask mrt) {
      _result = _result && mrt._result;
    }

    @Override
    protected void setupLocal() {
      File f = new File(_path);
      if (! f.exists())
        _result = true;
      else if (f.isDirectory()) {
        File[] content = f.listFiles();
        _result = (content != null) && (content.length == 0);
      } else
        _result = false;
    }
  }

  public long length(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      long l = I[Value.HDFS].length(path);
      return l;
    }

    File f = new File(path);
    if (! f.exists()) {
      throw new IllegalArgumentException("File not found (" + path + ")");
    }
    return f.length();
  }

  public InputStream open(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      InputStream is = I[Value.HDFS].open(path);
      return is;
    }

    try {
      File f = new File(path);
      return new FileInputStream(f);
    } catch (FileNotFoundException e) {
      throw new IllegalArgumentException("File not found (" + path + ")");
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
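  // Illustrative sketch (not part of the original source): the node-persistent-storage read
  // helpers above dispatch on isHdfsPath(), so the same calls work for a local directory and for
  // an HDFS-style path. "pm" stands for an instance of this class and the path is made up.
  //
  //   String dir = "/tmp/h2o-exports";
  //   if (pm.exists(dir) && pm.isDirectory(dir)) {
  //     PersistEntry[] entries = pm.list(dir);
  //     Log.info(dir + " holds " + entries.length + " entries");
  //   }
  //   boolean safeTarget = pm.isEmptyDirectoryAllNodes(dir);  // empty or missing on every node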
  // Writes

  public boolean mkdirs(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      boolean b = I[Value.HDFS].mkdirs(path);
      return b;
    }

    File f = new File(path);
    boolean b = f.mkdirs();
    return b;
  }

  public boolean rename(String fromPath, String toPath) {
    if (isHdfsPath(fromPath) || isHdfsPath(toPath)) {
      validateHdfsConfigured();
      boolean b = I[Value.HDFS].rename(fromPath, toPath);
      return b;
    }

    File f = new File(fromPath);
    File t = new File(toPath);
    boolean b = f.renameTo(t);
    return b;
  }

  public OutputStream create(String path, boolean overwrite) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      return I[Value.HDFS].create(path, overwrite);
    }

    try {
      if (! overwrite) {
        File f = new File(path);
        if (f.exists()) {
          throw new IllegalArgumentException("File already exists (" + path + ")");
        }
      }
      return new FileOutputStream(path);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  public boolean delete(String path) {
    if (isHdfsPath(path)) {
      validateHdfsConfigured();
      boolean b = I[Value.HDFS].delete(path);
      return b;
    }

    File f = new File(path);
    boolean b = f.delete();
    return b;
  }

  public Persist getPersistForURI(URI uri) {
    String scheme = uri.getScheme();
    boolean windowsPath = scheme != null && scheme.matches("^[a-zA-Z]$");
    if (windowsPath) {
      return I[Value.ICE];
    }

    if (scheme != null) {
      switch (scheme) {
        case Schemes.FILE:
          return I[Value.ICE]; // Local FS
        case Schemes.HDFS:
        case Schemes.S3N:
        case Schemes.S3A:
          return I[Value.HDFS];
        case Schemes.S3:
          return I[Value.S3];
        default:
          if (useHdfsAsFallback() && I[Value.HDFS] != null && I[Value.HDFS].canHandle(uri.toString())) {
            return I[Value.HDFS];
          } else {
            throw new IllegalArgumentException("Cannot find persist manager for scheme " + scheme);
          }
      }
    } else {
      return I[Value.ICE];
    }
  }

  /**
   * Finds all entries in the list that match the regex
   * @param prefix The substring to strip off before pattern matching
   * @param fileList The list of strings to check
   * @param matchStr The regular expression applied to the string after the prefix
   * @return list containing the matching entries
   */
  public ArrayList<String> matchPattern(String prefix, ArrayList<String> fileList, String matchStr) {
    ArrayList<String> result = new ArrayList<>();
    if (matchStr != null) {
      Pattern pattern = Pattern.compile(matchStr);
      for (String s : fileList) {
        Matcher matcher = pattern.matcher(afterPrefix(s, prefix));
        if (matcher.find()) {
          result.add(s);
        }
      }
    }
    return result;
  }

  /**
   * Returns the part of the string that occurs after the last occurrence of the substring
   * @param wholeString A string that needs to be subsetted
   * @param substring The substring to strip off
   * @return string after substring
   */
  private static String afterPrefix(String wholeString, String substring) {
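  // Illustrative sketch (not part of the original source): how importFiles() uses the two helpers
  // above. afterPrefix() strips the import root before the regex is applied, so the pattern only
  // has to describe the remainder of the path. "pm" stands for an instance of this class and the
  // values are made up.
  //
  //   ArrayList<String> found = new ArrayList<>(java.util.Arrays.asList(
  //       "/data/logs/2020-01.csv", "/data/logs/2020-02.csv", "/data/logs/readme.txt"));
  //   ArrayList<String> csvOnly = pm.matchPattern("/data/logs", found, ".*\\.csv$");
  //   // csvOnly -> ["/data/logs/2020-01.csv", "/data/logs/2020-02.csv"]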
    int posSubstring = wholeString.lastIndexOf(substring);
    if (posSubstring == -1) {
      return "";
    }
    int adjustedPosSubstring = posSubstring + substring.length();
    if (adjustedPosSubstring >= wholeString.length()) {
      return "";
    }
    return wholeString.substring(adjustedPosSubstring);
  }

  /** Should the HDFS persist layer be used as the default persist layer
   *  for unknown URL schemes?
   *  @return true if HDFS should handle unknown URL schemes.
   */
  static boolean useHdfsAsFallback() {
    return System.getProperty(PROP_ENABLE_HDFS_FALLBACK, "true").equals("true");
  }
}