/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import com.ibm.bi.dml.conf.ConfigurationManager;
import com.ibm.bi.dml.parser.DataExpression;
import com.ibm.bi.dml.parser.Expression.ValueType;
import com.ibm.bi.dml.runtime.DMLRuntimeException;
import com.ibm.bi.dml.runtime.io.MatrixReader;
import com.ibm.bi.dml.runtime.io.MatrixReaderFactory;
import com.ibm.bi.dml.runtime.matrix.MatrixCharacteristics;
import com.ibm.bi.dml.runtime.matrix.data.CSVFileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.FileFormatProperties;
import com.ibm.bi.dml.runtime.matrix.data.InputInfo;
import com.ibm.bi.dml.runtime.matrix.data.MatrixBlock;
import com.ibm.bi.dml.runtime.matrix.data.NumItemsByEachReducerMetaData;
import com.ibm.bi.dml.runtime.matrix.data.OutputInfo;
import com.ibm.bi.dml.runtime.matrix.sort.ReadWithZeros;
public class MapReduceTool
{
private static final Log LOG = LogFactory.getLog(MapReduceTool.class.getName());
private static JobConf _rJob = null; //cached job conf for read-only operations
static{
_rJob = ConfigurationManager.getCachedJobConf();
}
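/**
 * Extracts a unique per-task key from the configured "mapred.task.id", i.e., the
 * mapper or reducer task number with leading zeros removed.
 *
 * @param job job configuration
 * @param inMapper true if called from a mapper, false if called from a reducer
 * @return task number as a string, without leading zeros
 */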
public static String getUniqueKeyPerTask(JobConf job, boolean inMapper) {
//TODO: investigate ID pattern, required for parallel jobs
/*String nodePrefix = job.get("mapred.task.id");
return String.valueOf(IDHandler.extractLongID(nodePrefix));*/
String nodePrefix = job.get("mapred.task.id");
int i;
if (inMapper)
i = nodePrefix.indexOf("_m_");
else
i = nodePrefix.indexOf("_r_");
int j = nodePrefix.lastIndexOf("_");
nodePrefix = nodePrefix.substring(i + 3, j);
// remove all the leading 0s
return String.valueOf(Long.parseLong(nodePrefix));
}
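/**
 * Same as {@link #getUniqueKeyPerTask(JobConf, boolean)} but keeps leading zeros.
 * (The method name retains a historical typo and is kept for compatibility.)
 *
 * @param job job configuration
 * @param inMapper true if called from a mapper, false if called from a reducer
 * @return task number as a string, including leading zeros
 */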
@Deprecated
public static String getUniqueKeyPerTaskWithLeadingZros(JobConf job, boolean inMapper) {
String nodePrefix = job.get("mapred.task.id");
int i;
if (inMapper)
i = nodePrefix.indexOf("_m_");
else
i = nodePrefix.indexOf("_r_");
int j = nodePrefix.lastIndexOf("_");
nodePrefix = nodePrefix.substring(i + 3, j);
return nodePrefix;
}
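/**
 * Extracts the numeric task ID from the configured "mapred.task.id".
 *
 * @param job job configuration
 * @return task ID as an integer
 */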
public static int getUniqueTaskId(JobConf job) {
//TODO: investigate ID pattern, required for parallel jobs
/*String nodePrefix = job.get("mapred.task.id");
return IDHandler.extractIntID(nodePrefix);*/
String nodePrefix = job.get("mapred.task.id");
int j = nodePrefix.lastIndexOf("_");
int i=nodePrefix.lastIndexOf("_", j-1);
nodePrefix = nodePrefix.substring(i+1, j);
// System.out.println("nodePrefix = " + nodePrefix) ;
return Integer.valueOf(nodePrefix);
}
public static String getGloballyUniqueName(JobConf job) {
return job.get("mapred.task.id");
}
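/**
 * Checks if the given file or directory exists on HDFS.
 *
 * @param fname HDFS file name
 * @return true if the path exists, false otherwise or if the check fails
 */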
public static boolean existsFileOnHDFS(String fname){
boolean ret = true;
try{
Path outpath = new Path(fname);
ret = FileSystem.get(_rJob).exists(outpath);
}
catch(Exception ex)
{
LOG.error("Exception caught in existsFileOnHDFS", ex);
ret = false;
}
return ret;
}
public static void deleteFileIfExistOnHDFS(Path outpath, JobConf job) throws IOException {
if (FileSystem.get(job).exists(outpath)) {
FileSystem.get(job).delete(outpath, true);
}
}
public static void deleteFileIfExistOnLFS(Path outpath, JobConf job) throws IOException {
if (FileSystem.getLocal(job).exists(outpath)) {
FileSystem.getLocal(job).delete(outpath, true);
}
}
public static void deleteFileIfExistOnHDFS(String dir) throws IOException {
Path outpath = new Path(dir);
FileSystem fs = FileSystem.get(_rJob);
if (fs.exists(outpath)) {
//System.err.println("Deleting " + outpath + " ... ");
fs.delete(outpath, true);
}
}
public static boolean isHDFSDirectory(String dir) throws IOException {
FileSystem fs = FileSystem.get(_rJob);
Path pth = new Path(dir);
FileStatus fstat = fs.getFileStatus(pth);
return fstat.isDirectory();
}
public static boolean isHDFSFileEmpty(String dir) throws IOException {
FileSystem fs = FileSystem.get(_rJob);
return isFileEmpty(fs, dir);
}
public static boolean isFileEmpty(FileSystem fs, String dir) throws IOException {
Path pth = new Path(dir);
FileStatus fstat = fs.getFileStatus(pth);
if (fstat.isDirectory()) {
// it is a directory
FileStatus[] stats = fs.listStatus(pth);
if (stats != null) {
for (FileStatus stat : stats) {
if (stat.getLen() > 0)
return false;
}
return true;
} else {
return true;
}
} else {
// it is a regular file
return (fstat.getLen() == 0);
}
}
public static void renameFileOnHDFS(String originalDir, String newDir) throws IOException {
Path originalpath = new Path(originalDir);
deleteFileIfExistOnHDFS(newDir);
Path newpath = new Path(newDir);
FileSystem fs = FileSystem.get(_rJob);
if (fs.exists(originalpath)) {
fs.rename(originalpath, newpath);
}
else {
throw new FileNotFoundException(originalDir);
}
}
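/**
 * Merges all part files under the given HDFS directory into a single HDFS file
 * and deletes the source directory.
 *
 * @param originalDir HDFS directory containing the part files
 * @param newFile HDFS file name of the merged output
 * @throws IOException if the merge fails
 */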
public static void mergeIntoSingleFile(String originalDir, String newFile) throws IOException {
FileSystem fs = FileSystem.get(_rJob);
FileUtil.copyMerge(fs, new Path(originalDir), fs, new Path(newFile), true, _rJob, null);
}
public static void copyFileOnHDFS(String originalDir, String newDir) throws IOException {
Path originalPath = new Path(originalDir);
Path newPath = new Path(newDir);
boolean deleteSource = false;
boolean overwrite = true;
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
FileSystem fs = FileSystem.get(job);
if (fs.exists(originalPath)) {
FileUtil.copy(fs, originalPath, fs, newPath, deleteSource, overwrite, job);
}
}
/**
 * Returns a comma-separated list of all file/directory paths directly under the given HDFS directory.
 *
 * @param dir HDFS directory to list
 * @return comma-separated list of child paths
 * @throws IOException if the directory cannot be listed
 */
public static String getSubDirs(String dir)
throws IOException
{
FileSystem fs = FileSystem.get(_rJob);
FileStatus[] files = fs.listStatus(new Path(dir));
StringBuilder sb = new StringBuilder();
for (FileStatus file : files) {
if ( sb.length()>0 )
sb.append(",");
sb.append(file.getPath().toString());
}
return sb.toString();
}
/**
 * Returns a comma-separated list of all file/directory paths directly under the given HDFS directory,
 * ignoring any paths that contain "_logs".
 *
 * @param dir HDFS directory to list
 * @return comma-separated list of child paths, excluding log directories
 * @throws IOException if the directory cannot be listed
 */
public static String getSubDirsIgnoreLogs(String dir)
throws IOException
{
FileSystem fs = FileSystem.get(_rJob);
FileStatus[] files = fs.listStatus(new Path(dir));
StringBuilder sb = new StringBuilder();
for (FileStatus file : files) {
String name = file.getPath().toString();
if (name.contains("_logs"))
continue;
if( sb.length()>0 )
sb.append(",");
sb.append(name);
}
return sb.toString();
}
/**
 * Returns the size of a file or directory on HDFS in bytes.
 *
 * @param path HDFS path of the file or directory
 * @return size in bytes (content summary for directories, file length otherwise)
 * @throws IOException if the file status cannot be obtained
 */
public static long getFilesizeOnHDFS( Path path )
throws IOException
{
FileSystem fs = FileSystem.get(_rJob);
long ret = 0; //in bytes
if( fs.isDirectory(path) )
ret = fs.getContentSummary(path).getLength();
else
ret = fs.getFileStatus(path).getLen();
//note: filestatus would return 0 on directories
return ret;
}
private static BufferedReader setupInputFile ( String filename ) throws IOException {
Path pt=new Path(filename);
FileSystem fs = FileSystem.get(_rJob);
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(pt)));
return br;
}
public static double readDoubleFromHDFSFile(String filename)
throws IOException
{
BufferedReader br = setupInputFile(filename);
String line = br.readLine();
br.close();
if( line == null )
throw new IOException("Empty file on hdfs: "+filename);
return Double.parseDouble(line);
}
public static long readIntegerFromHDFSFile(String filename)
throws IOException
{
BufferedReader br = setupInputFile(filename);
String line = br.readLine();
br.close();
if( line == null )
throw new IOException("Empty file on hdfs: "+filename);
return Long.parseLong(line);
}
public static boolean readBooleanFromHDFSFile(String filename)
throws IOException
{
BufferedReader br = setupInputFile(filename);
String line = br.readLine();
br.close();
if( line == null )
throw new IOException("Empty file on hdfs: "+filename);
return Boolean.parseBoolean(line);
}
public static String readStringFromHDFSFile(String filename)
throws IOException
{
BufferedReader br = setupInputFile(filename);
// handle multi-line strings in the HDFS file
StringBuilder sb = new StringBuilder();
String line = null;
while ( (line = br.readLine()) != null ) {
sb.append(line);
sb.append("\n");
}
br.close();
//return string without the trailing newline (guard against empty files)
return (sb.length() > 0) ? sb.substring(0, sb.length()-1) : "";
}
private static BufferedWriter setupOutputFile ( String filename ) throws IOException {
Path pt=new Path(filename);
FileSystem fs = FileSystem.get(_rJob);
BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
return br;
}
public static void writeDoubleToHDFS ( double d, String filename ) throws IOException {
BufferedWriter br = setupOutputFile(filename);
String line = "" + d;
br.write(line);
br.close();
}
public static void writeIntToHDFS ( long i, String filename ) throws IOException {
BufferedWriter br = setupOutputFile(filename);
String line = "" + i;
br.write(line);
br.close();
}
public static void writeBooleanToHDFS ( boolean b, String filename ) throws IOException {
BufferedWriter br = setupOutputFile(filename);
String line = "" + b;
br.write(line);
br.close();
}
public static void writeStringToHDFS ( String s, String filename ) throws IOException {
BufferedWriter br = setupOutputFile(filename);
String line = "" + s;
br.write(line);
br.close();
}
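/**
 * Writes a dims file with one line per result whose dimensions are unknown,
 * in the format "&lt;resultIndex&gt; &lt;maxRows&gt; &lt;maxCols&gt;".
 *
 * @param filename HDFS file name of the dims file
 * @param unknownFlags flags indicating which results have unknown dimensions
 * @param maxRows observed maximum number of rows per result
 * @param maxCols observed maximum number of columns per result
 * @throws IOException if the file cannot be written
 */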
public static void writeDimsFile ( String filename, byte[] unknownFlags, long[] maxRows, long[] maxCols) throws IOException {
BufferedWriter br = setupOutputFile(filename);
StringBuilder line = new StringBuilder();
for ( int i=0; i < unknownFlags.length; i++ ) {
if ( unknownFlags[i] != (byte)0 ) {
line.append(i);
line.append(" " + maxRows[i]);
line.append(" " + maxCols[i]);
line.append("\n");
}
}
br.write(line.toString());
br.close();
//System.out.println("Finished writing dimsFile: " + filename);
}
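/**
 * Reads all dims files in the given HDFS directory and updates the matrix characteristics
 * of each referenced result with the maximum observed number of rows and columns.
 *
 * @param dir HDFS directory containing the dims files
 * @param stats matrix characteristics to update
 * @return the updated matrix characteristics
 * @throws IOException if the path cannot be read or is not a directory
 */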
public static MatrixCharacteristics[] processDimsFiles(String dir, MatrixCharacteristics[] stats)
throws IOException
{
Path pt=new Path(dir);
FileSystem fs = FileSystem.get(_rJob);
if ( !fs.exists(pt) )
return stats;
FileStatus fstat = fs.getFileStatus(pt);
if ( fstat.isDirectory() )
{
FileStatus[] files = fs.listStatus(pt);
for ( int i=0; i < files.length; i++ ) {
Path filePath = files[i].getPath();
//System.out.println("Processing dims file: " + filePath.toString());
BufferedReader br = setupInputFile(filePath.toString());
String line = "";
while((line=br.readLine()) != null ) {
String[] parts = line.split(" ");
int resultIndex = Integer.parseInt(parts[0]);
long maxRows = Long.parseLong(parts[1]);
long maxCols = Long.parseLong(parts[2]);
stats[resultIndex].setDimension( (stats[resultIndex].getRows() < maxRows ? maxRows : stats[resultIndex].getRows()),
(stats[resultIndex].getCols() < maxCols ? maxCols : stats[resultIndex].getCols()) );
}
br.close();
}
}
else
{
throw new IOException(dir + " is expected to be a folder!");
}
return stats;
}
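/**
 * Writes a JSON metadata (mtd) file describing a matrix: value type, dimensions,
 * block sizes (for binary block format), number of non-zeros, and format-specific properties.
 *
 * @param mtdfile HDFS file name of the metadata file
 * @param v value type of the matrix
 * @param mc matrix characteristics (dimensions, block sizes, non-zeros)
 * @param outinfo output format of the matrix
 * @throws IOException if the metadata file cannot be written
 */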
public static void writeMetaDataFile ( String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo) throws IOException {
writeMetaDataFile(mtdfile, v, mc, outinfo, null);
}
public static void writeMetaDataFile( String mtdfile, ValueType v, MatrixCharacteristics mc, OutputInfo outinfo, FileFormatProperties formatProperties )
throws IOException
{
Path pt = new Path(mtdfile);
FileSystem fs = FileSystem.get(_rJob);
BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
formatProperties = (formatProperties==null && outinfo==OutputInfo.CSVOutputInfo) ?
new CSVFileFormatProperties() : formatProperties;
String line = "";
try {
line += "{ \n" +
" \"" + DataExpression.DATATYPEPARAM + "\": \"matrix\"\n" +
" ,\"" + DataExpression.VALUETYPEPARAM + "\": ";
switch (v) {
case DOUBLE:
line += "\"double\"\n";
break;
case INT:
line += "\"int\"\n";
break;
case BOOLEAN:
line += "\"boolean\"\n";
break;
case STRING:
line += "\"string\"\n";
break;
case UNKNOWN:
line += "\"unknown\"\n";
break;
case OBJECT:
line += "\"object\"\n";
break;
}
line +=
" ,\"" + DataExpression.READROWPARAM + "\": " + mc.getRows() + "\n" +
" ,\"" + DataExpression.READCOLPARAM + "\": " + mc.getCols() + "\n";
// only output rows_in_block and cols_in_block for binary format
if ( outinfo == OutputInfo.BinaryBlockOutputInfo) {
line += " ,\"" + DataExpression.ROWBLOCKCOUNTPARAM + "\": " + mc.getRowsPerBlock() + "\n" +
" ,\"" + DataExpression.COLUMNBLOCKCOUNTPARAM + "\": " + mc.getColsPerBlock() + "\n";
}
line += " ,\"" + DataExpression.READNUMNONZEROPARAM + "\": " + mc.getNonZeros() + "\n" +
" ,\"" + DataExpression.FORMAT_TYPE + "\": ";
if ( outinfo == OutputInfo.TextCellOutputInfo ) {
line += "\"text\"\n";
} else if (outinfo == OutputInfo.BinaryBlockOutputInfo || outinfo == OutputInfo.BinaryCellOutputInfo ) {
line += "\"binary\"\n"; // currently, there is no way to differentiate between them
} else if (outinfo == OutputInfo.CSVOutputInfo ) {
line += "\"csv\"\n";
} else {
line += "\"specialized\"\n";
}
if ( outinfo == OutputInfo.CSVOutputInfo) {
CSVFileFormatProperties csvProperties = (CSVFileFormatProperties) formatProperties;
line += " ,\"" + DataExpression.DELIM_HAS_HEADER_ROW + "\": " + csvProperties.hasHeader() + "\n";
line += " ,\"" + DataExpression.DELIM_DELIMITER + "\": \"" + csvProperties.getDelim() + "\"\n";
}
line += " ,\"description\": { \"author\": \"SystemML\" } \n" + "}" ;
br.write(line);
br.close();
}catch (Exception e) {
throw new IOException(e);
}
}
public static void writeScalarMetaDataFile ( String mtdfile, ValueType v ) throws IOException {
Path pt=new Path(mtdfile);
FileSystem fs = FileSystem.get(_rJob);
BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
try {
String line = "";
line += "{ \n" +
" \"" + DataExpression.DATATYPEPARAM + "\": \"scalar\"\n" +
" ,\"" + DataExpression.VALUETYPEPARAM + "\": ";
switch (v) {
case DOUBLE:
line += "\"double\"\n";
break;
case INT:
line += "\"int\"\n";
break;
case BOOLEAN:
line += "\"boolean\"\n";
break;
case STRING:
line += "\"string\"\n";
break;
case UNKNOWN:
line += "\"unknown\"\n";
break;
case OBJECT:
throw new IOException("Write of generic object types not supported.");
}
line += " ,\"" + DataExpression.FORMAT_TYPE + "\": \"text\"\n" +
" ,\"description\": { \"author\": \"SystemML\" } \n" +" }" ;
br.write(line);
br.close();
}catch (Exception e) {
throw new IOException(e);
}
}
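/**
 * Reads a matrix from HDFS into a dense double[][] array using the reader
 * that matches the given input format.
 *
 * @param dir HDFS file name of the matrix
 * @param inputinfo input format of the matrix
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @return matrix as a dense double[][] array
 * @throws IOException if the matrix cannot be read
 * @throws DMLRuntimeException if the reader fails
 */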
public static double[][] readMatrixFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen)
throws IOException, DMLRuntimeException
{
MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen);
return DataConverter.convertToDoubleMatrix(mb);
}
public static double[] readColumnVectorFromHDFS(String dir, InputInfo inputinfo, long rlen, long clen, int brlen, int bclen)
throws IOException, DMLRuntimeException
{
MatrixReader reader = MatrixReaderFactory.createMatrixReader(inputinfo);
MatrixBlock mb = reader.readMatrixFromHDFS(dir, rlen, clen, brlen, bclen, rlen*clen);
return DataConverter.convertToDoubleVector(mb);
}
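/**
 * Computes the median of the sorted values described by the given reducer metadata,
 * averaging the two middle values if the total number of items is even.
 *
 * @param dir HDFS directory containing the sorted part files
 * @param metadata item counts per reducer
 * @return median value
 * @throws IOException if the part files cannot be read
 */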
public static double median(String dir, NumItemsByEachReducerMetaData metadata) throws IOException {
long[] counts=metadata.getNumItemsArray();
long[] ranges=new long[counts.length];
ranges[0]=counts[0];
for(int i=1; i<counts.length; i++)
ranges[i]=ranges[i-1]+counts[i];
long total=ranges[ranges.length-1];
return pickValueWeight(dir, metadata, 0.5, total%2==0)[0];
}
public static double pickValue(String dir, NumItemsByEachReducerMetaData metadata, double p) throws IOException {
return pickValueWeight(dir, metadata, p, false)[0];
}
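/**
 * Picks the value at quantile p from the sorted, partitioned data described by the
 * given reducer metadata, optionally averaging with the subsequent value.
 *
 * @param dir HDFS directory containing the sorted part files
 * @param metadata item counts per reducer
 * @param p quantile in [0,1]
 * @param average if true, average the picked value with the subsequent value
 * @return array of {value, weight, cumulative weight}; weight and cumulative weight are -1 if average is true
 * @throws IOException if the part files cannot be read
 */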
public static double[] pickValueWeight(String dir, NumItemsByEachReducerMetaData metadata, double p, boolean average)
throws IOException
{
long[] counts=metadata.getNumItemsArray();
long[] ranges=new long[counts.length];
ranges[0]=counts[0];
for(int i=1; i<counts.length; i++)
ranges[i]=ranges[i-1]+counts[i];
long total=ranges[ranges.length-1];
// average only if requested and the total count (sum of weights) is even
average = average && (total%2 == 0);
int currentPart=0;
double cum_weight = 0;
long pos=(long)Math.ceil(total*p);
while(ranges[currentPart]<pos) {
currentPart++;
cum_weight += ranges[currentPart];
}
int offset;
if(currentPart>0)
offset=(int)(pos-ranges[currentPart-1]-1);
else
offset=(int)pos-1;
FileSystem fs=FileSystem.get(_rJob);
Path path=new Path(dir);
FileStatus[] files=fs.listStatus(path);
Path fileToRead=null;
for(FileStatus file: files)
if(file.getPath().toString().endsWith(Integer.toString(currentPart)))
{
fileToRead=file.getPath();
break;
}
if(fileToRead==null)
throw new RuntimeException("cannot read partition "+currentPart);
FSDataInputStream currentStream=fs.open(fileToRead);
DoubleWritable readKey=new DoubleWritable();
IntWritable readValue=new IntWritable();
boolean contain0s=false;
long numZeros=0;
if(currentPart==metadata.getPartitionOfZero())
{
contain0s=true;
numZeros=metadata.getNumberOfZero();
}
ReadWithZeros reader=new ReadWithZeros(currentStream, contain0s, numZeros);
int numRead=0;
while(numRead<=offset)
{
reader.readNextKeyValuePairs(readKey, readValue);
numRead+=readValue.get();
cum_weight += readValue.get();
}
double ret = readKey.get();
if(average) {
if(numRead<=offset+1) {
reader.readNextKeyValuePairs(readKey, readValue);
cum_weight += readValue.get();
ret = (ret+readKey.get())/2;
}
}
currentStream.close();
return new double[] {ret, (average ? -1 : readValue.get()), (average ? -1 : cum_weight)};
}
/**
 * Extracts the numeric part index from an output file name of the form "part-&lt;number&gt;".
 *
 * @param name output file name containing a "part-" prefix
 * @return parsed part number
 */
public static int extractNumberFromOutputFile(String name)
{
int i=name.indexOf("part-");
assert(i>=0);
return Integer.parseInt(name.substring(i+5));
}
/**
 * Creates the given directory on HDFS (with the requested permissions) if it does not exist yet.
 *
 * @param dir HDFS directory to create
 * @param permissions octal permission string (e.g., "700")
 * @throws IOException if the directory cannot be created
 */
public static void createDirIfNotExistOnHDFS(String dir, String permissions)
throws IOException
{
Path path = new Path(dir);
try {
FileSystem fs = FileSystem.get(_rJob);
if( !fs.exists(path) )
{
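//convert the octal permission string (e.g., "700") into a numeric permission mode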
char[] c = permissions.toCharArray();
short sU = (short)((c[0]-48) * 64);
short sG = (short)((c[1]-48) * 8);
short sO = (short)((c[2]-48));
short mode = (short)(sU + sG + sO);
FsPermission perm = new FsPermission(mode);
fs.mkdirs(path, perm);
}
}
catch (Exception ex){
throw new IOException("Failed in creating a non existing dir on HDFS", ex);
}
//NOTE: we depend on the configured umask, setting umask in job or fspermission has no effect
//similarly setting dfs.datanode.data.dir.perm as no effect either.
}
/**
 * Opens an HDFS output stream for the given file.
 *
 * @param filename HDFS file name
 * @param overwrite if true, an existing file is overwritten
 * @return output stream to the created file
 * @throws IOException if the file cannot be created
 */
public static FSDataOutputStream getHDFSDataOutputStream(String filename, boolean overwrite)
throws IOException
{
FileSystem fs = FileSystem.get(_rJob);
Path path = new Path(filename);
return fs.create(path, overwrite);
}
}