/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.sysml.runtime.util; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.io.DoubleWritable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.mapred.JobConf; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.parser.DataExpression; import org.apache.sysml.parser.Expression.DataType; import org.apache.sysml.parser.Expression.ValueType; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.runtime.io.IOUtilFunctions; import org.apache.sysml.runtime.io.MatrixReader; import org.apache.sysml.runtime.io.MatrixReaderFactory; import org.apache.sysml.runtime.matrix.MatrixCharacteristics; import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties; import org.apache.sysml.runtime.matrix.data.FileFormatProperties; import org.apache.sysml.runtime.matrix.data.InputInfo; import org.apache.sysml.runtime.matrix.data.MatrixBlock; import org.apache.sysml.runtime.matrix.data.NumItemsByEachReducerMetaData; import org.apache.sysml.runtime.matrix.data.OutputInfo; import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames; import org.apache.sysml.runtime.matrix.sort.ReadWithZeros; import org.apache.wink.json4j.JSONException; import org.apache.wink.json4j.OrderedJSONObject; public class MapReduceTool { private static final int MAX_DELETE_RETRIES = 10; private static final Log LOG = LogFactory.getLog(MapReduceTool.class.getName()); private static JobConf _rJob = null; //cached job conf for read-only operations static{ _rJob = ConfigurationManager.getCachedJobConf(); } public static String getUniqueKeyPerTask(JobConf job, boolean inMapper) { //TODO: investigate ID pattern, required for parallel jobs /*String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID); return String.valueOf(IDHandler.extractLongID(nodePrefix));*/ String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID); int i; if (inMapper) i = nodePrefix.indexOf("_m_"); else i = nodePrefix.indexOf("_r_"); int j = nodePrefix.lastIndexOf("_"); nodePrefix = nodePrefix.substring(i + 3, j); // remove all the leading 0s return String.valueOf(Long.parseLong(nodePrefix)); } public static int getUniqueTaskId(JobConf job) { //TODO: investigate ID pattern, required for parallel jobs /*String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID); return IDHandler.extractIntID(nodePrefix);*/ String nodePrefix = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID); int j = nodePrefix.lastIndexOf("_"); int i=nodePrefix.lastIndexOf("_", j-1); nodePrefix = nodePrefix.substring(i+1, j); // System.out.println("nodePrefix = " + nodePrefix) ; return Integer.valueOf(nodePrefix); } public static String getGloballyUniqueName(JobConf job) { return job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID); } public static boolean existsFileOnHDFS(String fname){ try { return FileSystem.get(_rJob) .exists(new Path(fname)); } catch(Exception ex) { LOG.error("Failed check existsFileOnHDFS.", ex); } return false; } public static boolean isDirectory(String fname) { try { return FileSystem.get(_rJob) .isDirectory(new Path(fname)); } catch(Exception ex) { LOG.error("Failed check isDirectory.", ex); } return false; } public static FileStatus[] getDirectoryListing(String fname) { try { return FileSystem.get(_rJob) .listStatus(new Path(fname)); } catch(Exception ex) { LOG.error("Failed listing of directory contents.", ex); } return new FileStatus[0]; } public static void deleteFileWithMTDIfExistOnHDFS(String fname) throws IOException { deleteFileIfExistOnHDFS(fname); deleteFileIfExistOnHDFS(fname + ".mtd"); } public static void deleteFileIfExistOnHDFS(String dir) throws IOException { deleteFileIfExists(FileSystem.get(_rJob), new Path(dir)); } public static void deleteFileIfExistOnHDFS(Path outpath, JobConf job) throws IOException { deleteFileIfExists(FileSystem.get(job), outpath); } public static void deleteFileIfExistOnLFS(Path outpath, JobConf job) throws IOException { deleteFileIfExists(FileSystem.getLocal(job), outpath); } private static void deleteFileIfExists(FileSystem fs, Path outpath) throws IOException { if( fs.exists(outpath) ) { int retries = MAX_DELETE_RETRIES; while( !fs.delete(outpath, true) && retries > 0 ) { retries--; } } } public static boolean isHDFSFileEmpty(String dir) throws IOException { FileSystem fs = FileSystem.get(_rJob); return isFileEmpty(fs, dir); } public static boolean isFileEmpty(FileSystem fs, String dir) throws IOException { Path pth = new Path(dir); FileStatus fstat = fs.getFileStatus(pth); if (fstat.isDirectory()) { // it is a directory FileStatus[] stats = fs.listStatus(pth); if (stats != null) { for (FileStatus stat : stats) { if (stat.getLen() > 0) return false; } return true; } else { return true; } } else { // it is a regular file if (fstat.getLen() == 0) return true; else return false; } } public static void renameFileOnHDFS(String originalDir, String newDir) throws IOException { Path originalpath = new Path(originalDir); deleteFileIfExistOnHDFS(newDir); Path newpath = new Path(newDir); FileSystem fs = FileSystem.get(_rJob); if (fs.exists(originalpath)) { fs.rename(originalpath, newpath); } else { throw new FileNotFoundException(originalDir); } } public static void mergeIntoSingleFile(String originalDir, String newFile) throws IOException { FileSystem fs = FileSystem.get(_rJob); FileUtil.copyMerge(fs, new Path(originalDir), fs, new Path(newFile), true, _rJob, null); } public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException { Path originalPath = new Path(originalDir); Path newPath = new Path(newDir); boolean deleteSource = false; boolean overwrite = true; JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); FileSystem fs = FileSystem.get(job); if (fs.exists(originalPath)) { FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job); } } /** * Returns the size of a file or directory on hdfs in bytes. * * @param path file system path * @return file size * @throws IOException if IOException occurs */ public static long getFilesizeOnHDFS( Path path ) throws IOException { FileSystem fs = FileSystem.get(_rJob); long ret = 0; //in bytes if( fs.isDirectory(path) ) ret = fs.getContentSummary(path).getLength(); else ret = fs.getFileStatus(path).getLen(); //note: filestatus would return 0 on directories return ret; } private static BufferedReader setupInputFile ( String filename ) throws IOException { Path pt=new Path(filename); FileSystem fs = FileSystem.get(_rJob); BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt))); return br; } public static double readDoubleFromHDFSFile(String filename) throws IOException { return (Double)readObjectFromHDFSFile(filename, ValueType.DOUBLE); } public static long readIntegerFromHDFSFile(String filename) throws IOException { return (Long)readObjectFromHDFSFile(filename, ValueType.INT); } public static boolean readBooleanFromHDFSFile(String filename) throws IOException { return (Boolean)readObjectFromHDFSFile(filename, ValueType.BOOLEAN); } public static String readStringFromHDFSFile(String filename) throws IOException { StringBuilder sb = new StringBuilder(); try( BufferedReader br = setupInputFile(filename) ) { // handle multi-line strings in the HDFS file String line = null; while ( (line = br.readLine()) != null ) { sb.append(line); sb.append("\n"); } } //return string without last character return sb.substring(0, sb.length()-1); } public static Object readObjectFromHDFSFile(String filename, ValueType vt) throws IOException { String line = null; try( BufferedReader br = setupInputFile(filename) ) { line = br.readLine(); } if( line == null ) throw new IOException("Empty file on hdfs: "+filename); switch( vt ) { case BOOLEAN: return Boolean.parseBoolean(line); case DOUBLE: return Double.parseDouble(line); case INT: return Long.parseLong(line); default: return line; } } private static BufferedWriter setupOutputFile ( String filename ) throws IOException { Path pt=new Path(filename); FileSystem fs = FileSystem.get(_rJob); BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))); return br; } public static void writeDoubleToHDFS ( double d, String filename ) throws IOException { writeObjectToHDFS(d, filename); } public static void writeIntToHDFS ( long i, String filename ) throws IOException { writeObjectToHDFS(i, filename); } public static void writeBooleanToHDFS ( boolean b, String filename ) throws IOException { writeObjectToHDFS(b, filename); } public static void writeStringToHDFS ( String s, String filename ) throws IOException { writeObjectToHDFS(s, filename); } public static void writeObjectToHDFS ( Object obj, String filename ) throws IOException { try( BufferedWriter br = setupOutputFile(filename) ) { br.write(obj.toString()); } } public static void writeDimsFile ( String filename, byte[] unknownFlags, long[] maxRows, long[] maxCols) throws IOException { try( BufferedWriter br = setupOutputFile(filename) ) { StringBuilder line = new StringBuilder(); for ( int i=0; i < unknownFlags.length; i++ ) { if ( unknownFlags[i] != (byte)0 ) { line.append(i); line.append(" " + maxRows[i]); line.append(" " + maxCols[i]); line.append("\n"); } } br.write(line.toString()); } } public static MatrixCharacteristics[] processDimsFiles(String dir, MatrixCharacteristics[] stats) throws IOException { Path pt=new Path(dir); FileSystem fs = FileSystem.get(_rJob); if ( !fs.exists(pt) ) return stats; FileStatus fstat = fs.getFileStatus(pt); if ( fstat.isDirectory() ) { FileStatus[] files = fs.listStatus(pt); for ( int i=0; i < files.length; i++ ) { Path filePath = files[i].getPath(); try( BufferedReader br = setupInputFile(filePath.toString()) ) { String line = ""; while((line=br.readLine()) != null ) { String[] parts = line.split(" "); int resultIndex = Integer.parseInt(parts[0]); long maxRows = Long.parseLong(parts[1]); long maxCols = Long.parseLong(parts[2]); stats[resultIndex].setDimension( (stats[resultIndex].getRows() < maxRows ? maxRows : stats[resultIndex].getRows()), (stats[resultIndex].getCols() < maxCols ? maxCols : stats[resultIndex].getCols()) ); } } } } else { throw new IOException(dir + " is expected to be a folder!"); } return stats; } public static void writeMetaDataFile(String mtdfile, ValueType vt, MatrixCharacteristics mc, OutputInfo outinfo) throws IOException { writeMetaDataFile(mtdfile, vt, null, DataType.MATRIX, mc, outinfo); } public static void writeMetaDataFile(String mtdfile, ValueType vt, ValueType[] schema, DataType dt, MatrixCharacteristics mc, OutputInfo outinfo) throws IOException { writeMetaDataFile(mtdfile, vt, schema, dt, mc, outinfo, null); } public static void writeMetaDataFile(String mtdfile, ValueType vt, MatrixCharacteristics mc, OutputInfo outinfo, FileFormatProperties formatProperties) throws IOException { writeMetaDataFile(mtdfile, vt, null, DataType.MATRIX, mc, outinfo, formatProperties); } public static void writeMetaDataFile(String mtdfile, ValueType vt, ValueType[] schema, DataType dt, MatrixCharacteristics mc, OutputInfo outinfo, FileFormatProperties formatProperties) throws IOException { Path pt = new Path(mtdfile); FileSystem fs = FileSystem.get(_rJob); try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))) ) { String mtd = metaDataToString(vt, schema, dt, mc, outinfo, formatProperties); br.write(mtd); } catch (Exception e) { throw new IOException("Error creating and writing metadata JSON file", e); } } public static void writeScalarMetaDataFile(String mtdfile, ValueType vt) throws IOException { Path pt = new Path(mtdfile); FileSystem fs = FileSystem.get(_rJob); try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))) ) { String mtd = metaDataToString(vt, null, DataType.SCALAR, null, OutputInfo.TextCellOutputInfo, null); br.write(mtd); } catch (Exception e) { throw new IOException("Error creating and writing metadata JSON file", e); } } public static String metaDataToString(ValueType vt, ValueType[] schema, DataType dt, MatrixCharacteristics mc, OutputInfo outinfo, FileFormatProperties formatProperties) throws JSONException, DMLRuntimeException { OrderedJSONObject mtd = new OrderedJSONObject(); // maintain order in output file //handle data type and value types (incl schema for frames) mtd.put(DataExpression.DATATYPEPARAM, dt.toString().toLowerCase()); if (schema == null) { mtd.put(DataExpression.VALUETYPEPARAM, vt.toString().toLowerCase()); } else { StringBuffer schemaSB = new StringBuffer(); for(int i=0; i < schema.length; i++) { if( schema[i] == ValueType.UNKNOWN ) schemaSB.append("*"); else schemaSB.append(schema[i].toString()); schemaSB.append(DataExpression.DEFAULT_DELIM_DELIMITER); } mtd.put(DataExpression.SCHEMAPARAM, schemaSB.toString()); } //handle output dimensions if( !dt.isScalar() ) { mtd.put(DataExpression.READROWPARAM, mc.getRows()); mtd.put(DataExpression.READCOLPARAM, mc.getCols()); // handle output nnz and binary block configuration if( dt.isMatrix() ) { if (outinfo == OutputInfo.BinaryBlockOutputInfo ) { mtd.put(DataExpression.ROWBLOCKCOUNTPARAM, mc.getRowsPerBlock()); mtd.put(DataExpression.COLUMNBLOCKCOUNTPARAM, mc.getColsPerBlock()); } mtd.put(DataExpression.READNUMNONZEROPARAM, mc.getNonZeros()); } } //handle format type and additional arguments mtd.put(DataExpression.FORMAT_TYPE, OutputInfo.outputInfoToStringExternal(outinfo)); if (outinfo == OutputInfo.CSVOutputInfo) { CSVFileFormatProperties csvProperties = (formatProperties==null) ? new CSVFileFormatProperties() : (CSVFileFormatProperties)formatProperties; mtd.put(DataExpression.DELIM_HAS_HEADER_ROW, csvProperties.hasHeader()); mtd.put(DataExpression.DELIM_DELIMITER, csvProperties.getDelim()); } if (formatProperties != null) { String description = formatProperties.getDescription(); if (StringUtils.isNotEmpty(description)) { String jsonDescription = StringEscapeUtils.escapeJson(description); mtd.put(DataExpression.DESCRIPTIONPARAM, jsonDescription); } } String userName = System.getProperty("user.name"); if (StringUtils.isNotEmpty(userName)) { mtd.put(DataExpression.AUTHORPARAM, userName); } else { mtd.put(DataExpression.AUTHORPARAM, "SystemML"); } SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss z"); mtd.put(DataExpression.CREATEDPARAM, sdf.format(new Date())); return mtd.toString(4); // indent with 4 spaces } public static double[][] readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo); MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen); return DataConverter.convertToDoubleMatrix(mb); } public static double[] readColumnVectorFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException { MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo); MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen); return DataConverter.convertToDoubleVector(mb); } public static double median(String dir, NumItemsByEachReducerMetaData metadata) throws IOException { long[] counts=metadata.getNumItemsArray(); long[] ranges=new long[counts.length]; ranges[0]=counts[0]; for(int i=1; i<counts.length; i++) ranges[i]=ranges[i-1]+counts[i]; long total=ranges[ranges.length-1]; return pickValueWeight(dir, metadata, 0.5, total%2==0)[0]; } public static double pickValue(String dir, NumItemsByEachReducerMetaData metadata, double p) throws IOException { return pickValueWeight(dir, metadata, p, false)[0]; } public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average) throws IOException { long[] counts=metadata.getNumItemsArray(); long[] ranges=new long[counts.length]; ranges[0]=counts[0]; for(int i=1; i<counts.length; i++) ranges[i]=ranges[i-1]+counts[i]; long total=ranges[ranges.length-1]; // do averaging only if it is asked for; and sum_wt is even average = average && (total%2 == 0); int currentPart=0; double cum_weight = 0; long pos=(long)Math.ceil(total*p); while(ranges[currentPart]<pos) { currentPart++; cum_weight += ranges[currentPart]; } int offset; if(currentPart>0) offset=(int)(pos-ranges[currentPart-1]-1); else offset=(int)pos-1; FileSystem fs=FileSystem.get(_rJob); Path path=new Path(dir); FileStatus[] files=fs.listStatus(path); Path fileToRead=null; for(FileStatus file: files) if(file.getPath().toString().endsWith(Integer.toString(currentPart))) { fileToRead=file.getPath(); break; } if(fileToRead==null) throw new RuntimeException("cannot read partition "+currentPart); int buffsz = 64 * 1024; DoubleWritable readKey=new DoubleWritable(); IntWritable readValue=new IntWritable(); FSDataInputStream currentStream = null; double ret = -1; try { currentStream = fs.open(fileToRead, buffsz); boolean contain0s=false; long numZeros=0; if(currentPart==metadata.getPartitionOfZero()) { contain0s=true; numZeros=metadata.getNumberOfZero(); } ReadWithZeros reader=new ReadWithZeros(currentStream, contain0s, numZeros); int numRead=0; while(numRead<=offset) { reader.readNextKeyValuePairs(readKey, readValue); numRead+=readValue.get(); cum_weight += readValue.get(); } ret = readKey.get(); if(average) { if(numRead<=offset+1) { reader.readNextKeyValuePairs(readKey, readValue); cum_weight += readValue.get(); ret = (ret+readKey.get())/2; } } } finally { IOUtilFunctions.closeSilently(currentStream); } return new double[] {ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight)}; } public static void createDirIfNotExistOnHDFS(String dir, String permissions) throws IOException { Path path = new Path(dir); try { FileSystem fs = FileSystem.get(_rJob); if( !fs.exists(path) ) { char[] c = permissions.toCharArray(); short sU = (short)((c[0]-48) * 64); short sG = (short)((c[1]-48) * 8); short sO = (short)((c[2]-48)); short mode = (short)(sU + sG + sO); FsPermission perm = new FsPermission(mode); fs.mkdirs(path, perm); } } catch (Exception ex){ throw new IOException("Failed in creating a non existing dir on HDFS", ex); } //NOTE: we depend on the configured umask, setting umask in job or fspermission has no effect //similarly setting MRConfigurationNames.DFS_DATANODE_DATA_DIR_PERM as no effect either. } public static FSDataOutputStream getHDFSDataOutputStream(String filename, boolean overwrite) throws IOException { FileSystem fs = FileSystem.get(_rJob); Path path = new Path(filename); return fs.create(path, overwrite); } }