package water.api; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.model.ObjectListing; import com.amazonaws.services.s3.model.S3ObjectSummary; import org.apache.hadoop.fs.Path; import tachyon.client.TachyonFS; import tachyon.org.apache.thrift.TException; import tachyon.thrift.ClientFileInfo; import water.*; import water.api.RequestServer.API_VERSION; import water.fvec.*; import water.persist.*; import water.util.*; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class ImportFiles2 extends Request2 { static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. // This Request supports the HTML 'GET' command, and this is the help text // for GET. static final String DOC_GET = "Map a file from the source (either localhost filesystem, HDFS, or S3) into H2O memory. Data is "+ "loaded lazily, when the Key is read (usually in a Parse2 command, to build " + "a Frame key). (Warning: Every host in the cluster must have this file visible locally!)"; protected String parseLink(String k, String txt) { return Parse2.link(k, txt); } String parse() { return "Parse2.query"; } @Override public API_VERSION[] supportedVersions() { return SUPPORTS_ONLY_V2; } @API(help="Path to file/folder on either local disk/hdfs/s3",required=true,filter=GeneralFile.class,gridable=false) String path; @API(help="Common prefix for all successfully imported file keys") String prefix; @API(help="successfully imported files") String [] files; @API(help="keys of imported files") String [] keys; @API(help="files that failed to load") String [] fails; @API(help="Prior Keys that matched a prefix of the imported path, and were removed prior to (re)importing") String[] dels; public static Key[] importPath(String path){ File f = new File(path); assert f.exists():"file not found: " + f.getAbsolutePath(); ImportFiles2 imp = new ImportFiles2(); imp.path = path; imp.serve(); Key [] res = new Key[imp.keys.length]; for(int i = 0; i < res.length; ++i) res[i] = Key.make(imp.keys[i]); return res; } /** * Iterates over fields and their annotations, and creates argument handlers. */ @Override protected void registered(API_VERSION version) { super.registered(version); } @Override protected Response serve() { try{ if(path != null){ String p2 = path.toLowerCase(); if( false ) ; else if( p2.startsWith("hdfs://" ) ) serveHdfs(); else if( p2.startsWith("s3n://" ) ) serveHdfs(); else if( p2.startsWith("maprfs:/" ) ) serveHdfs(); // "maprfs:/datasets" is legal else if( p2.startsWith("s3://" ) ) serveS3(); else if( p2.startsWith("http://" ) ) serveHttp(); else if( p2.startsWith("https://") ) serveHttp(); else if( p2.startsWith("tachyon://")) serveTachyon(); else serveLocalDisk(); } return Response.done(this); } catch( Throwable e ) { return Response.error(e); } } protected void serveHdfs() throws IOException{ if (isBareS3NBucketWithoutTrailingSlash(path)) { path += "/"; } Log.info("ImportHDFS processing (" + path + ")"); ArrayList<String> succ = new ArrayList<String>(); ArrayList<String> fail = new ArrayList<String>(); PersistHdfs.addFolder2(new Path(path), succ, fail); keys = succ.toArray(new String[succ.size()]); files = keys; fails = fail.toArray(new String[fail.size()]); this.prefix = getCommonPrefix(keys); DKV.write_barrier(); } protected void serveS3(){ Futures fs = new Futures(); assert path.startsWith("s3://"); path = path.substring(5); int bend = path.indexOf('/'); if(bend == -1)bend = path.length(); String bucket = path.substring(0,bend); String prefix = bend < path.length()?path.substring(bend+1):""; AmazonS3 s3 = PersistS3.getClient(); if( !s3.doesBucketExist(bucket) ) throw new IllegalArgumentException("S3 Bucket " + bucket + " not found!");; ArrayList<String> succ = new ArrayList<String>(); ArrayList<String> fail = new ArrayList<String>(); ObjectListing currentList = s3.listObjects(bucket, prefix); while(true){ for(S3ObjectSummary obj:currentList.getObjectSummaries()) try { succ.add(S3FileVec.make(obj,fs).toString()); } catch( Throwable e ) { fail.add(obj.getKey()); Log.err("Failed to loadfile from S3: path = " + obj.getKey() + ", error = " + e.getClass().getName() + ", msg = " + e.getMessage()); } if(currentList.isTruncated()) currentList = s3.listNextBatchOfObjects(currentList); else break; } keys = succ.toArray(new String[succ.size()]); files = keys; fails = fail.toArray(new String[fail.size()]); this.prefix = getCommonPrefix(keys); } private void serveLocalDisk() { File f = new File(path); if(!f.exists())throw new IllegalArgumentException("File " + path + " does not exist!"); ArrayList<String> afiles = new ArrayList(); ArrayList<String> akeys = new ArrayList(); ArrayList<String> afails = new ArrayList(); ArrayList<String> adels = new ArrayList(); FileIntegrityChecker.check(f).syncDirectory(afiles,akeys,afails,adels); files = afiles.toArray(new String[0]); keys = akeys .toArray(new String[0]); fails = afails.toArray(new String[0]); dels = adels .toArray(new String[0]); prefix = getCommonPrefix(keys); } protected void serveHttp() { try { java.net.URL url = new URL(path); Key k = Key.make(path); InputStream is = url.openStream(); if( is == null ) { Log.err("Unable to open stream to URL " + path); } UploadFileVec.readPut(k, is); fails = new String[0]; String[] filesArr = { path }; files = filesArr; String[] keysArr = { k.toString() }; keys = keysArr; this.prefix = getCommonPrefix(keys); } catch( Throwable e) { String[] arr = { path }; fails = arr; files = new String[0]; keys = new String[0]; } } private void serveTachyon() { assert path.startsWith(PersistTachyon.PREFIX) : "Path "+path+" is not prefixed by tachyon prefix " + PersistTachyon.PREFIX; TachyonFS client = null; ArrayList<String> succ = new ArrayList<String>(); ArrayList<String> fail = new ArrayList<String>(); try { String[] pathComponents = PersistTachyon.decode(path); String serverUri = pathComponents[0]; // Be explicit, it would be possible to use default client URI, but better is throw an error if (serverUri==null || serverUri.isEmpty()) throw new IllegalArgumentException("The " + path + " is not legall URI - it is missing tachyon server URI (e.g., tachyon://localhost:19998/)." ); client = ((PersistTachyon) Persist.I[Value.TACHYON]).createClient(PersistTachyon.PREFIX+serverUri); String rootFolder = pathComponents[1]; List<ClientFileInfo> filesOnTFS= client.listStatus(rootFolder); // do a recursive descend Futures fs = new Futures(); for (ClientFileInfo f : filesOnTFS ) { try { succ.add(TachyonFileVec.make(serverUri, f, fs).toString()); } catch (Throwable t) { fail.add(f.getName()); Log.err("Failed to loadfile from Tachyon: path = " + f.path + ", error = " + t.getClass().getName() + ", msg = " + t.getMessage()); } } keys = succ.toArray(new String[succ.size()]); files = keys; fails = fail.toArray(new String[fail.size()]); this.prefix = getCommonPrefix(keys); } catch (IOException e) { fillEmpty("Cannot access specified file(s) on tachyon FS, because " + e.getMessage()); } finally { if (client!=null) try { client.close(); } catch (TException _ ) {}; } } private void fillEmpty(String failure) { fails = new String[] {failure}; files = new String[0]; keys = new String[0]; } private String getCommonPrefix(String[] keys) { String prefix = new String(); if(keys.length > 0) prefix = keys[0]; for(int i = 1; i < keys.length; i++) { String tmp = keys[i]; int j = 0; for(; j < Math.min(prefix.length(), tmp.length()); j++) { if(prefix.charAt(j) != tmp.charAt(j)) break; } prefix = prefix.substring(0, j); } return prefix; } // HTML builder @Override public boolean toHTML( StringBuilder sb ) { if(files == null)return false; if( files != null && files.length > 1 ) sb.append("<div class='alert'>") .append(parseLink("*"+prefix+"*", "Parse all into hex format")) .append(" </div>"); DocGen.HTML.title(sb,"files"); DocGen.HTML.arrayHead(sb); for( int i=0; i<files.length; i++ ) sb.append("<tr><td><a href='"+parse()+"?source_key=").append(keys[i]). append("'>").append(files[i]).append("</a></td></tr>"); DocGen.HTML.arrayTail(sb); if( fails.length > 0 ) DocGen.HTML.array(DocGen.HTML.title(sb,"fails"),fails); if( dels != null && dels.length > 0 ) DocGen.HTML.array(DocGen.HTML.title(sb,"Keys deleted before importing"),dels); return true; } private boolean isBareS3NBucketWithoutTrailingSlash(String s) { Pattern p = Pattern.compile("s3n://[^/]*"); Matcher m = p.matcher(s); boolean b = m.matches(); return b; } }