/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.bi.dml.runtime.util;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;

import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.io.MatrixReaderFactory;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.sort.ReadWithZeros;

public class MapReduceTool 
{
	private static final Log LOG = LogFactory.getLog(MapReduceTool.class.getName());

	private static JobConf _rJob = null; //cached job conf for read-only operations

	static {
		_rJob = ConfigurationManager.getCachedJobConf();
	}

	public static String getUniqueKeyPerTask(JobConf job, boolean inMapper) {
		//TODO: investigate ID pattern, required for parallel jobs
		/*String nodePrefix = job.get("mapred.task.id");
		return String.valueOf(IDHandler.extractLongID(nodePrefix));*/

		String nodePrefix = job.get("mapred.task.id");
		int i;
		if (inMapper)
			i = nodePrefix.indexOf("_m_");
		else
			i = nodePrefix.indexOf("_r_");
		int j = nodePrefix.lastIndexOf("_");
		nodePrefix = nodePrefix.substring(i + 3, j);
		// remove all the leading 0s
		return String.valueOf(Long.parseLong(nodePrefix));
	}

	@Deprecated
	public static String getUniqueKeyPerTaskWithLeadingZros(JobConf job, boolean inMapper) {
		String nodePrefix = job.get("mapred.task.id");
		int i;
		if (inMapper)
			i = nodePrefix.indexOf("_m_");
		else
			i = nodePrefix.indexOf("_r_");
		int j = nodePrefix.lastIndexOf("_");
		nodePrefix = nodePrefix.substring(i + 3, j);
		return nodePrefix;
	}

	public static int getUniqueTaskId(JobConf job) {
		//TODO: investigate ID pattern, required for parallel jobs
		/*String nodePrefix = job.get("mapred.task.id");
		return IDHandler.extractIntID(nodePrefix);*/

		String nodePrefix = job.get("mapred.task.id");
		int j = nodePrefix.lastIndexOf("_");
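		// mapred.task.id has the form attempt_<jobid>_<m|r>_<tasknum>_<attempt>;
		// the numeric task id is the token between the last two underscores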
		int i = nodePrefix.lastIndexOf("_", j-1);
		nodePrefix = nodePrefix.substring(i+1, j);
		// System.out.println("nodePrefix = " + nodePrefix) ;
		return Integer.valueOf(nodePrefix);
	}

	public static String getGloballyUniqueName(JobConf job) {
		return job.get("mapred.task.id");
	}

	public static boolean existsFileOnHDFS(String fname) {
		boolean ret = true;
		try {
			Path outpath = new Path(fname);
			ret = FileSystem.get(_rJob).exists(outpath);
		}
		catch(Exception ex) {
			LOG.error("Exception caught in existsFileOnHDFS", ex);
			ret = false;
		}
		return ret;
	}

	public static void deleteFileIfExistOnHDFS(Path outpath, JobConf job) throws IOException {
		if (FileSystem.get(job).exists(outpath)) {
			FileSystem.get(job).delete(outpath, true);
		}
	}

	public static void deleteFileIfExistOnLFS(Path outpath, JobConf job) throws IOException {
		if (FileSystem.getLocal(job).exists(outpath)) {
			FileSystem.getLocal(job).delete(outpath, true);
		}
	}

	public static void deleteFileIfExistOnHDFS(String dir) throws IOException {
		Path outpath = new Path(dir);
		FileSystem fs = FileSystem.get(_rJob);
		if (fs.exists(outpath)) {
			//System.err.println("Deleting " + outpath + " ... ");
			fs.delete(outpath, true);
		}
	}

	public static boolean isHDFSDirectory(String dir) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
		Path pth = new Path(dir);
		FileStatus fstat = fs.getFileStatus(pth);
		return fstat.isDirectory();
	}

	public static boolean isHDFSFileEmpty(String dir) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
		return isFileEmpty(fs, dir);
	}

	public static boolean isFileEmpty(FileSystem fs, String dir) throws IOException {
		Path pth = new Path(dir);
		FileStatus fstat = fs.getFileStatus(pth);

		if (fstat.isDirectory()) {
			// it is a directory
			FileStatus[] stats = fs.listStatus(pth);
			if (stats != null) {
				for (FileStatus stat : stats) {
					if (stat.getLen() > 0)
						return false;
				}
				return true;
			} else {
				return true;
			}
		} else {
			// it is a regular file
			if (fstat.getLen() == 0)
				return true;
			else
				return false;
		}
	}

	public static void renameFileOnHDFS(String originalDir, String newDir) throws IOException {
		Path originalpath = new Path(originalDir);

		deleteFileIfExistOnHDFS(newDir);
		Path newpath = new Path(newDir);

		FileSystem fs = FileSystem.get(_rJob);
		if (fs.exists(originalpath)) {
			fs.rename(originalpath, newpath);
		}
		else {
			throw new FileNotFoundException(originalDir);
		}
	}

	public static void mergeIntoSingleFile(String originalDir, String newFile) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
		FileUtil.copyMerge(fs, new Path(originalDir), fs, new Path(newFile), true, _rJob, null);
	}

	public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException {
		Path originalPath = new Path(originalDir);
		Path newPath = new Path(newDir);
		boolean deleteSource = false;
		boolean overwrite = true;

		JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
		FileSystem fs = FileSystem.get(job);
		if (fs.exists(originalPath)) {
			FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job);
		}
	}

	/**
	 * Returns a comma-separated list of all paths directly under the given HDFS directory.
	 * 
	 * @param dir
	 * @return
	 * @throws IOException
	 */
	public static String getSubDirs(String dir) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
		FileStatus[] files = fs.listStatus(new Path(dir));
		StringBuilder sb = new StringBuilder();
		for (FileStatus file : files) {
			if ( sb.length()>0 )
				sb.append(",");
			sb.append(file.getPath().toString());
		}
		return sb.toString();
	}

	/**
	 * Same as getSubDirs, but skips paths that contain "_logs".
	 * 
	 * @param dir
	 * @return
	 * @throws IOException
	 */
	public static String getSubDirsIgnoreLogs(String dir) throws IOException {
		FileSystem fs = FileSystem.get(_rJob);
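		// concatenate all child paths, skipping the MapReduce _logs directory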
		FileStatus[] files = fs.listStatus(new Path(dir));
		StringBuilder sb = new StringBuilder();
		for (FileStatus file : files) {
			String name = file.getPath().toString();
			if (name.contains("_logs"))
				continue;
			if( sb.length()>0 )
				sb.append(",");
			sb.append(name);
		}
		return sb.toString();
	}

	/**
	 * Returns the size of a file or directory on hdfs in bytes.
	 * 
	 * @param path
	 * @return
	 * @throws IOException
	 */
	public static long getFilesizeOnHDFS( Path path ) 
		throws IOException
	{
		FileSystem fs = FileSystem.get(_rJob);
		long ret = 0; //in bytes
		if( fs.isDirectory(path) )
			ret = fs.getContentSummary(path).getLength();
		else
			ret = fs.getFileStatus(path).getLen();
		//note: filestatus would return 0 on directories

		return ret;
	}

	private static BufferedReader setupInputFile ( String filename ) throws IOException {
		Path pt = new Path(filename);
		FileSystem fs = FileSystem.get(_rJob);
		BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt)));
		return br;
	}

	public static double readDoubleFromHDFSFile(String filename) throws IOException {
		BufferedReader br = setupInputFile(filename);
		String line = br.readLine();
		br.close();
		if( line == null )
			throw new IOException("Empty file on hdfs: "+filename);
		return Double.parseDouble(line);
	}

	public static long readIntegerFromHDFSFile(String filename) throws IOException {
		BufferedReader br = setupInputFile(filename);
		String line = br.readLine();
		br.close();
		if( line == null )
			throw new IOException("Empty file on hdfs: "+filename);
		return Long.parseLong(line);
	}

	public static boolean readBooleanFromHDFSFile(String filename) throws IOException {
		BufferedReader br = setupInputFile(filename);
		String line = br.readLine();
		br.close();
		if( line == null )
			throw new IOException("Empty file on hdfs: "+filename);
		return Boolean.parseBoolean(line);
	}

	public static String readStringFromHDFSFile(String filename) throws IOException {
		BufferedReader br = setupInputFile(filename);
		// handle multi-line strings in the HDFS file
		StringBuilder sb = new StringBuilder();
		String line = null;
		while ( (line = br.readLine()) != null ) {
			sb.append(line);
			sb.append("\n");
		}
		br.close();

		//return string without last character
		return sb.substring(0, sb.length()-1);
	}

	private static BufferedWriter setupOutputFile ( String filename ) throws IOException {
		Path pt = new Path(filename);
		FileSystem fs = FileSystem.get(_rJob);
		BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
		return br;
	}

	public static void writeDoubleToHDFS ( double d, String filename ) throws IOException {
		BufferedWriter br = setupOutputFile(filename);
		String line = "" + d;
		br.write(line);
		br.close();
	}

	public static void writeIntToHDFS ( long i, String filename ) throws IOException {
		BufferedWriter br = setupOutputFile(filename);
		String line = "" + i;
		br.write(line);
		br.close();
	}

	public static void writeBooleanToHDFS ( boolean b, String filename ) throws IOException {
		BufferedWriter br = setupOutputFile(filename);
		String line = "" + b;
		br.write(line);
		br.close();
	}

	public static void writeStringToHDFS ( String s, String filename ) throws IOException {
		BufferedWriter br = setupOutputFile(filename);
		String line = "" + s;
		br.write(line);
		br.close();
	}

	public static void writeDimsFile ( String filename, byte[] unknownFlags, long[] maxRows, long[] maxCols) throws IOException {
		BufferedWriter br = setupOutputFile(filename);
		StringBuilder line = new StringBuilder();
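		// one line per result index with still-unknown dimensions: "<resultIndex> <maxRows> <maxCols>"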
		for ( int i=0; i < unknownFlags.length; i++ ) {
			if ( unknownFlags[i] != (byte)0 ) {
				line.append(i);
				line.append(" " + maxRows[i]);
				line.append(" " + maxCols[i]);
				line.append("\n");
			}
		}
		br.write(line.toString());
		br.close();

		//System.out.println("Finished writing dimsFile: " + filename);
	}

	public static MatrixCharacteristics[] processDimsFiles(String dir, MatrixCharacteristics[] stats) 
		throws IOException 
	{
		Path pt = new Path(dir);
		FileSystem fs = FileSystem.get(_rJob);

		if ( !fs.exists(pt) )
			return stats;

		FileStatus fstat = fs.getFileStatus(pt);
		if ( fstat.isDirectory() ) {
			FileStatus[] files = fs.listStatus(pt);
			for ( int i=0; i < files.length; i++ ) {
				Path filePath = files[i].getPath();
				//System.out.println("Processing dims file: " + filePath.toString());
				BufferedReader br = setupInputFile(filePath.toString());
				String line = "";
				while((line=br.readLine()) != null ) {
					String[] parts = line.split(" ");
					int resultIndex = Integer.parseInt(parts[0]);
					long maxRows = Long.parseLong(parts[1]);
					long maxCols = Long.parseLong(parts[2]);

					stats[resultIndex].setDimension(
							(stats[resultIndex].getRows() < maxRows ? maxRows : stats[resultIndex].getRows()),
							(stats[resultIndex].getCols() < maxCols ? maxCols : stats[resultIndex].getCols()) );
				}
				br.close();
			}
		}
		else {
			throw new IOException(dir + " is expected to be a folder!");
		}

		return stats;
	}

	public static void writeMetaDataFile ( String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo) throws IOException {
		writeMetaDataFile(mtdfile, v, mc, outinfo, null);
	}

	public static void writeMetaDataFile( String mtdfile, ValueType v, MatrixCharacteristics mc, 
			OutputInfo outinfo, FileFormatProperties formatProperties ) 
		throws IOException 
	{
		Path pt = new Path(mtdfile);
		FileSystem fs = FileSystem.get(_rJob);
		BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
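		// fall back to default CSV properties when none were provided for CSV output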
		formatProperties = (formatProperties==null && outinfo==OutputInfo.CSVOutputInfo) ?
				new CSVFileFormatProperties() : formatProperties;

		String line = "";

		try {
			line += "{ \n" +
					" \"" + DataExpression.DATATYPEPARAM + "\": \"matrix\"\n" +
					" ,\"" + DataExpression.VALUETYPEPARAM + "\": ";

			switch (v) {
				case DOUBLE:
					line += "\"double\"\n";
					break;
				case INT:
					line += "\"int\"\n";
					break;
				case BOOLEAN:
					line += "\"boolean\"\n";
					break;
				case STRING:
					line += "\"string\"\n";
					break;
				case UNKNOWN:
					line += "\"unknown\"\n";
					break;
				case OBJECT:
					line += "\"object\"\n";
					break;
			}

			line += " ,\"" + DataExpression.READROWPARAM + "\": " + mc.getRows() + "\n" +
					" ,\"" + DataExpression.READCOLPARAM + "\": " + mc.getCols() + "\n";

			// only output rows_in_block and cols_in_block for binary format
			if ( outinfo == OutputInfo.BinaryBlockOutputInfo) {
				line += " ,\"" + DataExpression.ROWBLOCKCOUNTPARAM + "\": " + mc.getRowsPerBlock() + "\n" +
						" ,\"" + DataExpression.COLUMNBLOCKCOUNTPARAM + "\": " + mc.getColsPerBlock() + "\n";
			}

			line += " ,\"" + DataExpression.READNUMNONZEROPARAM + "\": " + mc.getNonZeros() + "\n" +
					" ,\"" + DataExpression.FORMAT_TYPE + "\": ";

			if ( outinfo == OutputInfo.TextCellOutputInfo ) {
				line += "\"text\"\n";
			} else if (outinfo == OutputInfo.BinaryBlockOutputInfo || outinfo == OutputInfo.BinaryCellOutputInfo ) {
				line += "\"binary\"\n"; // currently, there is no way to differentiate between them
			} else if (outinfo == OutputInfo.CSVOutputInfo ) {
				line += "\"csv\"\n";
			} else {
				line += "\"specialized\"\n";
			}

			if ( outinfo == OutputInfo.CSVOutputInfo) {
				CSVFileFormatProperties csvProperties = (CSVFileFormatProperties) formatProperties;
				line += " ,\"" + DataExpression.DELIM_HAS_HEADER_ROW + "\": " + csvProperties.hasHeader() + "\n";
				line += " ,\"" + DataExpression.DELIM_DELIMITER + "\": \"" + csvProperties.getDelim() + "\"\n";
			}

			line += " ,\"description\": { \"author\": \"SystemML\" } \n" +
					"}" ;

			br.write(line);
			br.close();
		}
		catch (Exception e) {
			throw new IOException(e);
		}
	}

	public static void writeScalarMetaDataFile ( String mtdfile, ValueType v ) throws IOException {
		Path pt = new Path(mtdfile);
		FileSystem fs = FileSystem.get(_rJob);
		BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));

		try {
			String line = "";
			line += "{ \n" +
					" \"" + DataExpression.DATATYPEPARAM + "\": \"scalar\"\n" +
					" ,\"" + DataExpression.VALUETYPEPARAM + "\": ";

			switch (v) {
				case DOUBLE:
					line += "\"double\"\n";
					break;
				case INT:
					line += "\"int\"\n";
					break;
				case BOOLEAN:
					line += "\"boolean\"\n";
					break;
				case STRING:
					line += "\"string\"\n";
					break;
				case UNKNOWN:
					line += "\"unknown\"\n";
					break;
				case OBJECT:
					throw new IOException("Write of generic object types not supported.");
			}

			line += " ,\"" + DataExpression.FORMAT_TYPE + "\": \"text\"\n" +
					" ,\"description\": { \"author\": \"SystemML\" } \n" +
					" }" ;

			br.write(line);
			br.close();
		}
		catch (Exception e) {
			throw new IOException(e);
		}
	}

	public static double[][] readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) 
		throws IOException, DMLRuntimeException
	{
		MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
		MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen);
		return DataConverter.convertToDoubleMatrix(mb);
	}

	public static double[] readColumnVectorFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) 
		throws IOException, DMLRuntimeException
	{
		MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
		MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen);
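		// flatten the read MatrixBlock into a dense double[] column vector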
		return DataConverter.convertToDoubleVector(mb);
	}

	public static double median(String dir, NumItemsByEachReducerMetaData metadata) throws IOException {
		long[] counts = metadata.getNumItemsArray();
		long[] ranges = new long[counts.length];
		ranges[0] = counts[0];
		for(int i=1; i<counts.length; i++)
			ranges[i] = ranges[i-1] + counts[i];

		long total = ranges[ranges.length-1];

		return pickValueWeight(dir, metadata, 0.5, total%2==0)[0];
	}

	public static double pickValue(String dir, NumItemsByEachReducerMetaData metadata, double p) throws IOException {
		return pickValueWeight(dir, metadata, p, false)[0];
	}

	public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average) 
		throws IOException 
	{
		long[] counts = metadata.getNumItemsArray();
		long[] ranges = new long[counts.length];
		ranges[0] = counts[0];
		for(int i=1; i<counts.length; i++)
			ranges[i] = ranges[i-1] + counts[i];

		long total = ranges[ranges.length-1];

		// do averaging only if it is asked for; and sum_wt is even
		average = average && (total%2 == 0);

		int currentPart = 0;
		double cum_weight = 0;
		long pos = (long)Math.ceil(total*p);
		while(ranges[currentPart] < pos) {
			currentPart++;
			cum_weight += ranges[currentPart];
		}
		int offset;
		if(currentPart > 0)
			offset = (int)(pos - ranges[currentPart-1] - 1);
		else
			offset = (int)pos - 1;

		FileSystem fs = FileSystem.get(_rJob);
		Path path = new Path(dir);
		FileStatus[] files = fs.listStatus(path);
		Path fileToRead = null;
		for(FileStatus file : files)
			if(file.getPath().toString().endsWith(Integer.toString(currentPart))) {
				fileToRead = file.getPath();
				break;
			}

		if(fileToRead == null)
			throw new RuntimeException("cannot read partition " + currentPart);

		FSDataInputStream currentStream = fs.open(fileToRead);
		DoubleWritable readKey = new DoubleWritable();
		IntWritable readValue = new IntWritable();

		boolean contain0s = false;
		long numZeros = 0;
		if(currentPart == metadata.getPartitionOfZero()) {
			contain0s = true;
			numZeros = metadata.getNumberOfZero();
		}
		ReadWithZeros reader = new ReadWithZeros(currentStream, contain0s, numZeros);

		int numRead = 0;
		while(numRead <= offset) {
			reader.readNextKeyValuePairs(readKey, readValue);
			numRead += readValue.get();
			cum_weight += readValue.get();
		}

		double ret = readKey.get();
		if(average) {
			if(numRead <= offset+1) {
				reader.readNextKeyValuePairs(readKey, readValue);
				cum_weight += readValue.get();
				ret = (ret+readKey.get())/2;
			}
		}
		currentStream.close();

		return new double[] {ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight)};
	}

	/**
	 * Extracts the numeric suffix from a MapReduce output file name of the form "part-XXXXX".
	 * 
	 * @param name
	 * @return
	 */
	public static int extractNumberFromOutputFile(String name) {
		int i = name.indexOf("part-");
		assert(i >= 0);
		return Integer.parseInt(name.substring(i+5));
	}

	/**
	 * Creates the given directory on HDFS, with the given octal permission string (e.g. "700"),
	 * if it does not exist yet.
	 * 
	 * @param dir
	 * @param permissions
	 * @throws IOException
	 */
	public static void createDirIfNotExistOnHDFS(String dir, String permissions) 
		throws IOException
	{
		Path path = new Path(dir);
		try {
			FileSystem fs = FileSystem.get(_rJob);
			if( !fs.exists(path) ) 
			{
				char[] c = permissions.toCharArray();
				short sU = (short)((c[0]-48) * 64);
				short sG = (short)((c[1]-48) * 8);
				short sO = (short)((c[2]-48));
				short mode = (short)(sU + sG + sO);
				FsPermission perm = new FsPermission(mode);
				fs.mkdirs(path, perm);
			}
		}
		catch (Exception ex) {
			throw new IOException("Failed in creating a non existing dir on HDFS", ex);
		}

		//NOTE: we depend on the configured umask, setting umask in job or fspermission has no effect
		//similarly, setting dfs.datanode.data.dir.perm has no effect either.
	}

	/**
	 * Returns an output stream for writing the given HDFS file; an existing file is replaced
	 * only if overwrite is true.
	 * 
	 * @param filename
	 * @param overwrite
	 * @return
	 * @throws IOException
	 */
	public static FSDataOutputStream getHDFSDataOutputStream(String filename, boolean overwrite) 
		throws IOException
	{
		FileSystem fs = FileSystem.get(_rJob);
		Path path = new Path(filename);
		return fs.create(path, overwrite);
	}
}