/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.matrix.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat;
import org.apache.sysml.runtime.controlprogram.parfor.stat.InfrastructureAnalyzer;
import org.apache.sysml.runtime.controlprogram.parfor.util.IDSequence;
import org.apache.sysml.runtime.instructions.Instruction;
import org.apache.sysml.runtime.instructions.MRInstructionParser;
import org.apache.sysml.runtime.instructions.mr.AggregateBinaryInstruction;
import org.apache.sysml.runtime.instructions.mr.AggregateInstruction;
import org.apache.sysml.runtime.instructions.mr.AppendGInstruction;
import org.apache.sysml.runtime.instructions.mr.AppendMInstruction;
import org.apache.sysml.runtime.instructions.mr.BinaryMInstruction;
import org.apache.sysml.runtime.instructions.mr.CM_N_COVInstruction;
import org.apache.sysml.runtime.instructions.mr.CSVReblockInstruction;
import org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction;
import org.apache.sysml.runtime.instructions.mr.DataGenMRInstruction;
import org.apache.sysml.runtime.instructions.mr.GroupedAggregateInstruction;
import org.apache.sysml.runtime.instructions.mr.MRInstruction;
import org.apache.sysml.runtime.instructions.mr.MapMultChainInstruction;
import org.apache.sysml.runtime.instructions.mr.PMMJMRInstruction;
import org.apache.sysml.runtime.instructions.mr.ReblockInstruction;
import org.apache.sysml.runtime.instructions.mr.RemoveEmptyMRInstruction;
import org.apache.sysml.runtime.instructions.mr.UnaryMRInstructionBase;
import org.apache.sysml.runtime.io.BinaryBlockSerialization;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.AddDummyWeightConverter;
import org.apache.sysml.runtime.matrix.data.BinaryBlockToBinaryCellConverter;
import org.apache.sysml.runtime.matrix.data.BinaryBlockToRowBlockConverter;
import org.apache.sysml.runtime.matrix.data.BinaryBlockToTextCellConverter;
import org.apache.sysml.runtime.matrix.data.BinaryCellToRowBlockConverter;
import org.apache.sysml.runtime.matrix.data.BinaryCellToTextConverter;
import org.apache.sysml.runtime.matrix.data.Converter;
import org.apache.sysml.runtime.matrix.data.IdenticalConverter;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixValue;
import org.apache.sysml.runtime.matrix.data.MultipleOutputCommitter;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TextCellToRowBlockConverter;
import org.apache.sysml.runtime.matrix.data.TextToBinaryCellConverter;
import org.apache.sysml.runtime.matrix.data.WeightedCellToSortInputConverter;
import org.apache.sysml.runtime.matrix.data.WeightedPair;
import org.apache.sysml.runtime.matrix.data.hadoopfix.MultipleInputs;
import org.apache.sysml.runtime.matrix.sort.SamplingSortMRInputFormat;
import org.apache.sysml.runtime.util.MapReduceTool;
import org.apache.sysml.yarn.ropt.YarnClusterAnalyzer;

@SuppressWarnings({ "rawtypes", "deprecation" })
public class MRJobConfiguration
{
    //internal param: custom deserializer/serializer (usually 30% faster than WritableSerialization)
    public static final boolean USE_BINARYBLOCK_SERIALIZATION = true;

    //Job configurations
    public static IDSequence seq = new IDSequence();

    //input matrices
    private static final String INPUT_MATRICIES_DIRS_CONFIG="input.matrices.dirs";

    //this is here to handle record reader instructions
    private static final String MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG="mapfuc.input.matrices.indexes";

    //about the formats of inputs
    private static final String BLOCK_REPRESENTATION_CONFIG="in.block.representation";
    private static final String WEIGHTEDCELL_REPRESENTATION_CONFIG="in.weighted.cell.representation";
    private static final String INPUT_CONVERTER_CLASS_PREFIX_CONFIG="input.converter.class.for.";
    private static final String INPUT_KEY_CLASS_PREFIX_CONFIG="input.key.class.for.";
    private static final String INPUT_VALUE_CLASS_PREFIX_CONFIG="input.value.class.for.";

    //characteristics about input matrices
    private static final String INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG="input.matrix.num.row.";
    private static final String INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG="input.matrix.num.column.";
    private static final String INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG="input.block.num.row.";
    private static final String INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG="input.block.num.column.";
    private static final String INPUT_MATRIX_NUM_NNZ_PREFIX_CONFIG="input.matrix.num.nnz.";

    //characteristics about the matrices to map outputs
    private static final String MAPOUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG="map.output.matrix.num.row.";
    private static final String MAPOUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG="map.output.matrix.num.column.";
    private static final String MAPOUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG="map.output.block.num.row.";
    private static final String MAPOUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG="map.output.block.num.column.";

    //operations performed in the mapper
    private static final String INSTRUCTIONS_IN_MAPPER_CONFIG="instructions.in.mapper";
    private static final String RAND_INSTRUCTIONS_CONFIG="rand.instructions";

    //matrix indexes to be output to the reducer
    private static final String OUTPUT_INDEXES_IN_MAPPER_CONFIG="output.indexes.in.mapper";

    //parfor serialized program
    private static final String PARFOR_PROGRAMBLOCKS_CONFIG = "parfor.programblocks.in.mr";
    private static final String PARFOR_CACHING_CONFIG = "parfor.cp.caching";

    //partitioning input/output info
    private static final String PARTITIONING_INPUT_MATRIX_NUM_ROW_CONFIG="partitioning.input.matrix.num.row";
    private static final String PARTITIONING_INPUT_MATRIX_NUM_COLUMN_CONFIG="partitioning.input.matrix.num.column";
    private static final String PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG="partitioning.input.block.num.row";
    private static final String PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG="partitioning.input.block.num.column";
    private static final String PARTITIONING_INPUT_INFO_CONFIG="partitioning.input.inputinfo";
    private static final String PARTITIONING_OUTPUT_INFO_CONFIG="partitioning.output.outputinfo";
    private static final String PARTITIONING_OUTPUT_FORMAT_CONFIG="partitioning.output.format";
    private static final String PARTITIONING_OUTPUT_N_CONFIG="partitioning.output.n";
    private static final String PARTITIONING_OUTPUT_FILENAME_CONFIG="partitioning.output.filename";
    private static final String PARTITIONING_ITERVAR_CONFIG="partitioning.itervar";
    private static final String PARTITIONING_MATRIXVAR_CONFIG="partitioning.matrixvar";
    private static final String PARTITIONING_TRANSPOSE_COL_CONFIG="partitioning.transposed.col";
    private static final String PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG="partitioning.output.keep.indexes";

    //result merge info
    private static final String RESULTMERGE_INPUT_INFO_CONFIG="resultmerge.input.inputinfo";
    private static final String RESULTMERGE_COMPARE_FILENAME_CONFIG="resultmerge.compare.filename";
    private static final String RESULTMERGE_STAGING_DIR_CONFIG="resultmerge.staging.dir";
    private static final String RESULTMERGE_MATRIX_NUM_ROW_CONFIG="resultmerge.matrix.num.row";
    private static final String RESULTMERGE_MATRIX_NUM_COLUMN_CONFIG="resultmerge.matrix.num.column";
    private static final String RESULTMERGE_BLOCK_NUM_ROW_CONFIG="resultmerge.block.num.row";
    private static final String RESULTMERGE_BLOCK_NUM_COLUMN_CONFIG="resultmerge.block.num.column";

    private static final String SORT_PARTITION_FILENAME = "sort.partition.filename";

    //operations performed in the reducer
    private static final String AGGREGATE_INSTRUCTIONS_CONFIG="aggregate.instructions.after.groupby.at";
    private static final String INSTRUCTIONS_IN_REDUCER_CONFIG="instructions.in.reducer";
    private static final String AGGREGATE_BINARY_INSTRUCTIONS_CONFIG="aggregate.binary.instructions";
    private static final String REBLOCK_INSTRUCTIONS_CONFIG="reblock.instructions";
    private static final String CSV_REBLOCK_INSTRUCTIONS_CONFIG="csv.reblock.instructions";
    private static final String CSV_WRITE_INSTRUCTIONS_CONFIG="csv.write.instructions";
    private static final String COMBINE_INSTRUCTIONS_CONFIG="combine.instructions";
    private static final String CM_N_COV_INSTRUCTIONS_CONFIG="cm_n_com.instructions";
    private static final String GROUPEDAGG_INSTRUCTIONS_CONFIG="groupedagg.instructions";

    //characteristics about the matrices to aggregate binary instructions
    private static final String AGGBIN_MATRIX_NUM_ROW_PREFIX_CONFIG="aggbin.matrix.num.row.";
    private static final String AGGBIN_MATRIX_NUM_COLUMN_PREFIX_CONFIG="aggbin.matrix.num.column.";
    private static final String AGGBIN_BLOCK_NUM_ROW_PREFIX_CONFIG="aggbin.block.num.row.";
    private static final String AGGBIN_BLOCK_NUM_COLUMN_PREFIX_CONFIG="aggbin.block.num.column.";

    //characteristics about the output matrices
    private static final String OUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG="output.matrix.num.row.";
    private static final String OUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG="output.matrix.num.column.";
    private static final String OUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG="output.block.num.row.";
    private static final String OUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG="output.block.num.column.";

    //characteristics about the matrices to reblock instructions
    private static final String REBLOCK_MATRIX_NUM_ROW_PREFIX_CONFIG="reblock.matrix.num.row.";
    private static final String REBLOCK_MATRIX_NUM_COLUMN_PREFIX_CONFIG="reblock.matrix.num.column.";
    private static final String REBLOCK_BLOCK_NUM_ROW_PREFIX_CONFIG="reblock.block.num.row.";
    private static final String REBLOCK_BLOCK_NUM_COLUMN_PREFIX_CONFIG="reblock.block.num.column.";
    private static final String REBLOCK_MATRIX_NUM_NNZ_PREFIX_CONFIG="reblock.matrix.num.nnz.";

    //characteristics about the matrices to matrixdiag instructions
    private static final String INTERMEDIATE_MATRIX_NUM_ROW_PREFIX_CONFIG="rdiag.matrix.num.row.";
    private static final String INTERMEDIATE_MATRIX_NUM_COLUMN_PREFIX_CONFIG="rdiag.matrix.num.column.";
    private static final String INTERMEDIATE_BLOCK_NUM_ROW_PREFIX_CONFIG="rdiag.block.num.row.";
    private static final String INTERMEDIATE_BLOCK_NUM_COLUMN_PREFIX_CONFIG="rdiag.block.num.column.";

    //matrix indexes to be output as final results
    private static final String RESULT_INDEXES_CONFIG="results.indexes";
    private static final String RESULT_DIMS_UNKNOWN_CONFIG="results.dims.unknown";
    private static final String INTERMEDIATE_INDEXES_CONFIG="rdiag.indexes";

    //output matrices and their formats
    public static final String OUTPUT_MATRICES_DIRS_CONFIG="output.matrices.dirs";
    private static final String OUTPUT_CONVERTER_CLASS_PREFIX_CONFIG="output.converter.class.for.";

    private static final String DIMS_UNKNOWN_FILE_PREFIX = "dims.unknown.file.prefix";

    private static final String MMCJ_CACHE_SIZE="mmcj.cache.size";

    private static final String DISTCACHE_INPUT_INDICES="distcache.input.indices";
    private static final String DISTCACHE_INPUT_PATHS = "distcache.input.paths";

    private static final String SYSTEMML_LOCAL_TMP_DIR = "systemml.local.tmp.dir";

    /*
     * SystemML Counter Group names
     *
     * group name for the counters on number of output nonZeros
     */
    public static final String NUM_NONZERO_CELLS="nonzeros";

    public static final String TF_NUM_COLS     = "transform.num.columns";
    public static final String TF_HAS_HEADER   = "transform.has.header";
    public static final String TF_DELIM        = "transform.field.delimiter";
    public static final String TF_NA_STRINGS   = "transform.na.strings";
    public static final String TF_HEADER       = "transform.header.line";
    public static final String TF_SPEC         = "transform.specification";
    public static final String TF_TMP_LOC      = "transform.temp.location";
    public static final String TF_TRANSFORM    = "transform.omit.na.rows";
    public static final String TF_SMALLEST_FILE= "transform.smallest.file";
    public static final String TF_OFFSETS_FILE = "transform.offsets.file";
    public static final String TF_TXMTD_PATH   = "transform.txmtd.path";

    /*public static enum DataTransformJobProperty
    {
        RCD_NUM_COLS("recode.num.columns");

        private final String name;
        private DataTransformJobProperty(String n) {
            name = n;
        }
    }*/

    public static enum DataTransformCounters {
        TRANSFORMED_NUM_ROWS
    };
    public static final int getMiscMemRequired(JobConf job) {
        return job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096);
    }

    public static void setMMCJCacheSize(JobConf job, long size) {
        job.setLong(MMCJ_CACHE_SIZE, size);
    }

    public static long getMMCJCacheSize(JobConf job) {
        return job.getLong(MMCJ_CACHE_SIZE, 0);
    }

    public static void setMatrixValueClass(JobConf job, boolean blockRepresentation) {
        job.setBoolean(BLOCK_REPRESENTATION_CONFIG, blockRepresentation);
    }

    public static void setMatrixValueClassForCM_N_COM(JobConf job, boolean weightedCellRepresentation) {
        job.setBoolean(WEIGHTEDCELL_REPRESENTATION_CONFIG, weightedCellRepresentation);
    }

    public static Class<? extends MatrixValue> getMatrixValueClass(JobConf job) {
        if(job.getBoolean(WEIGHTEDCELL_REPRESENTATION_CONFIG, false))
            return WeightedPair.class;

        if(job.getBoolean(BLOCK_REPRESENTATION_CONFIG, true))
            return MatrixBlock.class;
        else
            return MatrixCell.class;
    }

    public static enum ConvertTarget{CELL, BLOCK, WEIGHTEDCELL, CSVWRITE}

    public static Class<? extends Converter> getConverterClass(InputInfo inputinfo, int brlen, int bclen, ConvertTarget target) {
        Class<? extends Converter> converterClass=IdenticalConverter.class;
        if(inputinfo.inputValueClass.equals(MatrixCell.class)) {
            switch(target) {
                case CELL:
                    converterClass=IdenticalConverter.class;
                    break;
                case BLOCK:
                    throw new RuntimeException("cannot convert binary cell to binary block representation implicitly");
                case WEIGHTEDCELL:
                    converterClass=AddDummyWeightConverter.class;
                    break;
                case CSVWRITE:
                    converterClass=BinaryCellToRowBlockConverter.class;
                    break;
            }
        }
        else if(inputinfo.inputValueClass.equals(MatrixBlock.class)) {
            switch(target) {
                case CELL:
                    converterClass=BinaryBlockToBinaryCellConverter.class;
                    break;
                case BLOCK:
                    converterClass=IdenticalConverter.class;
                    break;
                case WEIGHTEDCELL:
                    converterClass=AddDummyWeightConverter.class;
                    break;
                case CSVWRITE:
                    converterClass=BinaryBlockToRowBlockConverter.class;
                    break;
            }
        }
        else if(inputinfo.inputValueClass.equals(Text.class)) {
            switch(target) {
                case CELL:
                    converterClass=TextToBinaryCellConverter.class;
                    break;
                case BLOCK:
                    if(brlen>1 || bclen>1)
                        throw new RuntimeException("cannot convert text cell to binary block representation implicitly");
                    else
                        converterClass=TextToBinaryCellConverter.class;
                    break;
                case WEIGHTEDCELL:
                    converterClass=AddDummyWeightConverter.class;
                    break;
                case CSVWRITE:
                    converterClass=TextCellToRowBlockConverter.class;
                    break;
            }
        }

        return converterClass;
    }
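    /*
     * Illustrative usage sketch (editorial addition, not part of the original file;
     * variable names are assumptions): a job driver picks the runtime value class
     * and resolves a matching converter as follows.
     *
     *   JobConf job = new JobConf();
     *   MRJobConfiguration.setMatrixValueClass(job, true); //binary-block values
     *   Class<? extends MatrixValue> vcls = MRJobConfiguration.getMatrixValueClass(job); //MatrixBlock.class
     *   Class<? extends Converter> ccls = MRJobConfiguration.getConverterClass(
     *       InputInfo.BinaryBlockInputInfo, 1000, 1000, ConvertTarget.CELL); //BinaryBlockToBinaryCellConverter.class
     */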
    /**
     * Unique working dirs required for thread-safe submission of parallel jobs;
     * otherwise job.xml and other files might be overridden (in local mode).
     *
     * @param job job configuration
     */
    public static void setUniqueWorkingDir( JobConf job ) {
        if( InfrastructureAnalyzer.isLocalMode(job) ) {
            StringBuilder tmp = new StringBuilder();
            tmp.append( Lop.FILE_SEPARATOR );
            tmp.append( Lop.PROCESS_PREFIX );
            tmp.append( DMLScript.getUUID() );
            tmp.append( Lop.FILE_SEPARATOR );
            tmp.append( seq.getNextID() );
            String uniqueSubdir = tmp.toString();

            //unique local dir
            String[] dirlist = job.get(MRConfigurationNames.MR_CLUSTER_LOCAL_DIR,"/tmp").split(",");
            StringBuilder sb2 = new StringBuilder();
            for( String dir : dirlist ) {
                if( sb2.length()>0 )
                    sb2.append(",");
                sb2.append(dir);
                sb2.append( uniqueSubdir );
            }
            job.set(MRConfigurationNames.MR_CLUSTER_LOCAL_DIR, sb2.toString() );

            //unique system dir
            job.set(MRConfigurationNames.MR_JOBTRACKER_SYSTEM_DIR, job.get(MRConfigurationNames.MR_JOBTRACKER_SYSTEM_DIR) + uniqueSubdir);

            //unique staging dir
            job.set( MRConfigurationNames.MR_JOBTRACKER_STAGING_ROOT_DIR, job.get(MRConfigurationNames.MR_JOBTRACKER_STAGING_ROOT_DIR) + uniqueSubdir );
        }
    }

    public static String getLocalWorkingDirPrefix(JobConf job) {
        return job.get(MRConfigurationNames.MR_CLUSTER_LOCAL_DIR);
    }

    public static String getSystemWorkingDirPrefix(JobConf job) {
        return job.get(MRConfigurationNames.MR_JOBTRACKER_SYSTEM_DIR);
    }

    public static String getStagingWorkingDirPrefix(JobConf job) {
        return job.get(MRConfigurationNames.MR_JOBTRACKER_STAGING_ROOT_DIR);
    }

    public static void setInputInfo(JobConf job, byte input, InputInfo inputinfo, int brlen, int bclen, ConvertTarget target) {
        Class<? extends Converter> converterClass=getConverterClass(inputinfo, brlen, bclen, target);
        job.setClass(INPUT_CONVERTER_CLASS_PREFIX_CONFIG+input, converterClass, Converter.class);
        job.setClass(INPUT_KEY_CLASS_PREFIX_CONFIG+input, inputinfo.inputKeyClass, Writable.class);
        job.setClass(INPUT_VALUE_CLASS_PREFIX_CONFIG+input, inputinfo.inputValueClass, Writable.class);
    }
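    /*
     * Sketch of the effect of setUniqueWorkingDir in local mode (editorial note;
     * the "_p" process prefix is an assumption based on Lop.PROCESS_PREFIX):
     * a local dir such as "/tmp" becomes "/tmp/_p<uuid>/<id>", and the same unique
     * suffix is appended to the jobtracker system and staging directories, so that
     * concurrently submitted local jobs cannot overwrite each other's job.xml.
     */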
    public static void setOutputInfo(JobConf job, int i, OutputInfo outputinfo, boolean sourceInBlock)
        throws DMLRuntimeException
    {
        Class<? extends Converter> converterClass;
        if(sourceInBlock) {
            if(outputinfo.outputValueClass.equals(MatrixCell.class))
                converterClass=BinaryBlockToBinaryCellConverter.class;
            else if(outputinfo.outputValueClass.equals(Text.class))
                converterClass=BinaryBlockToTextCellConverter.class;
            else if(outputinfo.outputValueClass.equals(MatrixBlock.class))
                converterClass=IdenticalConverter.class;
            else if(outputinfo.outputValueClass.equals(IntWritable.class))
                converterClass=WeightedCellToSortInputConverter.class;
            else if(outputinfo.outputValueClass.equals(WeightedPair.class))
                converterClass=IdenticalConverter.class;
            else
                converterClass=IdenticalConverter.class;
        }
        else {
            if(outputinfo.outputValueClass.equals(MatrixCell.class))
                converterClass=IdenticalConverter.class;
            else if(outputinfo.outputValueClass.equals(Text.class))
                converterClass=BinaryCellToTextConverter.class;
            else if(outputinfo.outputValueClass.equals(IntWritable.class))
                converterClass=WeightedCellToSortInputConverter.class;
            else if(outputinfo.outputValueClass.equals(WeightedPair.class))
                converterClass=IdenticalConverter.class;
            else
                throw new DMLRuntimeException("unsupported conversion: " + outputinfo.outputValueClass);
                // converterClass=IdenticalConverter.class;
        }
        job.setClass(OUTPUT_CONVERTER_CLASS_PREFIX_CONFIG+i, converterClass, Converter.class);
    }

    public static Converter getInputConverter(JobConf job, byte input) {
        Converter inputConverter;
        try {
            inputConverter=(Converter) job.getClass(INPUT_CONVERTER_CLASS_PREFIX_CONFIG+input,
                IdenticalConverter.class).newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return inputConverter;
    }

    public static Converter getOuputConverter(JobConf job, int i) {
        Converter outputConverter;
        try {
            outputConverter=(Converter) job.getClass(OUTPUT_CONVERTER_CLASS_PREFIX_CONFIG+i,
                IdenticalConverter.class).newInstance();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return outputConverter;
    }

    public static MRInstruction[] getInstructionsInReducer(JobConf job) throws DMLRuntimeException {
        String str=job.get(INSTRUCTIONS_IN_REDUCER_CONFIG);
        MRInstruction[] mixed_ops = MRInstructionParser.parseMixedInstructions(str);
        return mixed_ops;
    }

    public static ReblockInstruction[] getReblockInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(REBLOCK_INSTRUCTIONS_CONFIG);
        ReblockInstruction[] reblock_instructions = MRInstructionParser.parseReblockInstructions(str);
        return reblock_instructions;
    }

    public static CSVReblockInstruction[] getCSVReblockInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(CSV_REBLOCK_INSTRUCTIONS_CONFIG);
        CSVReblockInstruction[] reblock_instructions = MRInstructionParser.parseCSVReblockInstructions(str);
        return reblock_instructions;
    }

    public static CSVWriteInstruction[] getCSVWriteInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(CSV_WRITE_INSTRUCTIONS_CONFIG);
        CSVWriteInstruction[] csvwrite_instructions = MRInstructionParser.parseCSVWriteInstructions(str);
        return csvwrite_instructions;
    }

    public static AggregateInstruction[] getAggregateInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(AGGREGATE_INSTRUCTIONS_CONFIG);
        AggregateInstruction[] agg_instructions = MRInstructionParser.parseAggregateInstructions(str);
        return agg_instructions;
    }

    public static MRInstruction[] getCombineInstruction(JobConf job) throws DMLRuntimeException {
        String str=job.get(COMBINE_INSTRUCTIONS_CONFIG);
        MRInstruction[] comb_instructions = MRInstructionParser.parseCombineInstructions(str);
        return comb_instructions;
    }
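    /*
     * Illustrative round trip (editorial addition; the index 0 is arbitrary):
     * the driver declares the output conversion, and the task side re-instantiates it.
     *
     *   MRJobConfiguration.setOutputInfo(job, 0, OutputInfo.TextCellOutputInfo, true);
     *   Converter conv = MRJobConfiguration.getOuputConverter(job, 0); //BinaryBlockToTextCellConverter
     */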
    public static MRInstruction[] getInstructionsInMapper(JobConf job) throws DMLRuntimeException {
        String str=job.get(INSTRUCTIONS_IN_MAPPER_CONFIG);
        MRInstruction[] instructions = MRInstructionParser.parseMixedInstructions(str);
        return instructions;
    }

    //parfor configurations
    public static void setProgramBlocks(JobConf job, String sProgramBlocks) {
        job.set(PARFOR_PROGRAMBLOCKS_CONFIG, sProgramBlocks);
    }

    public static String getProgramBlocks(JobConf job) {
        String str = job.get(PARFOR_PROGRAMBLOCKS_CONFIG);
        return str;
    }

    public static void setParforCachingConfig(JobConf job, boolean flag) {
        job.setBoolean(PARFOR_CACHING_CONFIG, flag);
    }

    public static boolean getParforCachingConfig(JobConf job) {
        return job.getBoolean(PARFOR_CACHING_CONFIG, true);
    }

    //partitioning configurations
    public static void setPartitioningInfo( JobConf job, long rlen, long clen, int brlen, int bclen,
            InputInfo ii, OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew )
        throws DMLRuntimeException
    {
        job.set(PARTITIONING_INPUT_MATRIX_NUM_ROW_CONFIG, String.valueOf(rlen));
        job.set(PARTITIONING_INPUT_MATRIX_NUM_COLUMN_CONFIG, String.valueOf(clen));
        job.set(PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG, String.valueOf(brlen));
        job.set(PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG, String.valueOf(bclen));
        job.set(PARTITIONING_INPUT_INFO_CONFIG, InputInfo.inputInfoToString(ii));
        job.set(PARTITIONING_OUTPUT_INFO_CONFIG, OutputInfo.outputInfoToString(oi));
        job.set(PARTITIONING_OUTPUT_FORMAT_CONFIG, dpf.toString());
        job.set(PARTITIONING_OUTPUT_N_CONFIG, String.valueOf(n));
        job.set(PARTITIONING_OUTPUT_FILENAME_CONFIG, fnameNew);
    }

    public static void setPartitioningInfo( JobConf job, long rlen, long clen, int brlen, int bclen,
            InputInfo ii, OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew,
            String itervar, String matrixvar, boolean tSparseCol )
        throws DMLRuntimeException
    {
        //set basic partitioning information
        setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, dpf, n, fnameNew);

        //set iteration variable name (used for ParFor-DPE)
        job.set(PARTITIONING_ITERVAR_CONFIG, itervar);

        //set matrix variable name (used for ParFor-DPE)
        job.set(PARTITIONING_MATRIXVAR_CONFIG, matrixvar);

        //set transpose sparse column vector
        job.setBoolean(PARTITIONING_TRANSPOSE_COL_CONFIG, tSparseCol);
    }

    public static void setPartitioningInfo( JobConf job, long rlen, long clen, int brlen, int bclen,
            InputInfo ii, OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew, boolean keepIndexes )
        throws DMLRuntimeException
    {
        //set basic partitioning information
        setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, dpf, n, fnameNew);

        //set keep indexes flag
        job.setBoolean(PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG, keepIndexes);
    }

    public static long getPartitioningNumRows( JobConf job ) {
        return Long.parseLong(job.get(PARTITIONING_INPUT_MATRIX_NUM_ROW_CONFIG));
    }

    public static long getPartitioningNumCols( JobConf job ) {
        return Long.parseLong(job.get(PARTITIONING_INPUT_MATRIX_NUM_COLUMN_CONFIG));
    }

    public static void setPartitioningBlockNumRows( JobConf job, int brlen ) {
        job.set(PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG, String.valueOf(brlen));
    }

    public static int getPartitioningBlockNumRows( JobConf job ) {
        return Integer.parseInt(job.get(PARTITIONING_INPUT_BLOCK_NUM_ROW_CONFIG));
    }

    public static void setPartitioningBlockNumCols( JobConf job, int bclen ) {
        job.set(PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG,String.valueOf(bclen));
    }
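    /*
     * Illustrative sketch (editorial addition; all values made up): partitioning
     * metadata is a plain string round trip through the JobConf.
     *
     *   MRJobConfiguration.setPartitioningInfo(job, 10000, 100, 1000, 1000,
     *       InputInfo.BinaryBlockInputInfo, OutputInfo.BinaryBlockOutputInfo,
     *       PDataPartitionFormat.ROW_BLOCK_WISE_N, 2, "out");
     *   long rlen = MRJobConfiguration.getPartitioningNumRows(job); //10000
     */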
    public static int getPartitioningBlockNumCols( JobConf job ) {
        return Integer.parseInt(job.get(PARTITIONING_INPUT_BLOCK_NUM_COLUMN_CONFIG));
    }

    public static InputInfo getPartitioningInputInfo( JobConf job ) {
        return InputInfo.stringToInputInfo(job.get(PARTITIONING_INPUT_INFO_CONFIG));
    }

    public static OutputInfo getPartitioningOutputInfo( JobConf job ) {
        return OutputInfo.stringToOutputInfo(job.get(PARTITIONING_OUTPUT_INFO_CONFIG));
    }

    public static void setPartitioningFormat( JobConf job, PDataPartitionFormat dpf ) {
        job.set(PARTITIONING_OUTPUT_FORMAT_CONFIG, dpf.toString());
    }

    public static PDataPartitionFormat getPartitioningFormat( JobConf job ) {
        return PDataPartitionFormat.valueOf(job.get(PARTITIONING_OUTPUT_FORMAT_CONFIG));
    }

    public static int getPartitioningSizeN( JobConf job ) {
        return Integer.parseInt(job.get(PARTITIONING_OUTPUT_N_CONFIG));
    }

    public static boolean getPartitioningIndexFlag( JobConf job ) {
        return Boolean.parseBoolean(job.get(PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG));
    }

    public static void setPartitioningFilename( JobConf job, String fname ) {
        job.set(PARTITIONING_OUTPUT_FILENAME_CONFIG, fname);
    }

    public static String getPartitioningFilename( JobConf job ) {
        return job.get(PARTITIONING_OUTPUT_FILENAME_CONFIG);
    }

    public static String getPartitioningItervar( JobConf job ) {
        return job.get(PARTITIONING_ITERVAR_CONFIG);
    }

    public static String getPartitioningMatrixvar( JobConf job ) {
        return job.get(PARTITIONING_MATRIXVAR_CONFIG);
    }

    public static boolean getPartitioningTransposedCol( JobConf job ) {
        return job.getBoolean(PARTITIONING_TRANSPOSE_COL_CONFIG, false);
    }

    public static void setResultMergeInfo( JobConf job, String fnameNew, InputInfo ii, String stagingDir,
            long rlen, long clen, int brlen, int bclen )
        throws DMLRuntimeException
    {
        job.set(RESULTMERGE_COMPARE_FILENAME_CONFIG, fnameNew);
        job.set(RESULTMERGE_INPUT_INFO_CONFIG, InputInfo.inputInfoToString(ii));
        job.set(RESULTMERGE_STAGING_DIR_CONFIG, stagingDir);
        job.set(RESULTMERGE_MATRIX_NUM_ROW_CONFIG, String.valueOf(rlen));
        job.set(RESULTMERGE_MATRIX_NUM_COLUMN_CONFIG, String.valueOf(clen));
        job.set(RESULTMERGE_BLOCK_NUM_ROW_CONFIG, String.valueOf(brlen));
        job.set(RESULTMERGE_BLOCK_NUM_COLUMN_CONFIG, String.valueOf(bclen));
    }

    public static String getResultMergeInfoCompareFilename( JobConf job ) {
        return job.get(RESULTMERGE_COMPARE_FILENAME_CONFIG);
    }

    public static InputInfo getResultMergeInputInfo( JobConf job ) {
        return InputInfo.stringToInputInfo( job.get(RESULTMERGE_INPUT_INFO_CONFIG) );
    }

    public static long[] getResultMergeMatrixCharacteristics( JobConf job ) {
        long[] ret = new long[4];
        ret[0] = Long.parseLong(job.get(RESULTMERGE_MATRIX_NUM_ROW_CONFIG));
        ret[1] = Long.parseLong(job.get(RESULTMERGE_MATRIX_NUM_COLUMN_CONFIG));
        ret[2] = Long.parseLong(job.get(RESULTMERGE_BLOCK_NUM_ROW_CONFIG));
        ret[3] = Long.parseLong(job.get(RESULTMERGE_BLOCK_NUM_COLUMN_CONFIG));
        return ret;
    }

    public static byte[] getInputIndexesInMapper(JobConf job) {
        String[] istrs=job.get(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG).split(Instruction.INSTRUCTION_DELIM);
        return stringArrayToByteArray(istrs);
    }

    public static byte[] getOutputIndexesInMapper(JobConf job) {
        String[] istrs=job.get(OUTPUT_INDEXES_IN_MAPPER_CONFIG).split(Instruction.INSTRUCTION_DELIM);
        return stringArrayToByteArray(istrs);
    }
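    /*
     * Illustrative sketch (editorial addition; all values made up): result-merge
     * metadata follows the same string round-trip pattern.
     *
     *   MRJobConfiguration.setResultMergeInfo(job, "hdfs:/tmp/compare",
     *       InputInfo.BinaryBlockInputInfo, "/tmp/stage", 1000, 500, 1000, 1000);
     *   long[] mc = MRJobConfiguration.getResultMergeMatrixCharacteristics(job);
     *   //mc = {1000, 500, 1000, 1000}, i.e., {rlen, clen, brlen, bclen}
     */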
    //get the indexes that this matrix file represents,
    //since one matrix file can occur multiple times in a statement
    public static ArrayList<Byte> getInputMatrixIndexesInMapper(JobConf job) throws IOException {
        String[] matrices=job.getStrings(INPUT_MATRICIES_DIRS_CONFIG);
        String str=job.get(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG);
        byte[] indexes;
        if(str==null || str.isEmpty()) {
            indexes=new byte[matrices.length];
            for(int i=0; i<indexes.length; i++)
                indexes[i]=(byte)i;
        }
        else {
            String[] strs=str.split(Instruction.INSTRUCTION_DELIM);
            indexes=new byte[strs.length];
            for(int i=0; i<strs.length; i++)
                indexes[i]=Byte.parseByte(strs[i]);
        }

        int numMatrices=matrices.length;
        if(numMatrices>Byte.MAX_VALUE)
            throw new RuntimeException("number of matrices is too large > "+Byte.MAX_VALUE);
        for(int i=0; i<matrices.length; i++)
            matrices[i]=new Path(matrices[i]).toString();

        FileSystem fs=FileSystem.get(job);
        Path thisFile=new Path(job.get(MRConfigurationNames.MR_MAP_INPUT_FILE)).makeQualified(fs);

        //Path p=new Path(thisFileName);

        Path thisDir=thisFile.getParent().makeQualified(fs);
        ArrayList<Byte> representativeMatrixes=new ArrayList<Byte>();
        for(int i=0; i<matrices.length; i++) {
            Path p = new Path(matrices[i]).makeQualified(fs);
            if(thisFile.toUri().equals(p.toUri()) || thisDir.toUri().equals(p.toUri()))
                representativeMatrixes.add(indexes[i]);
        }
        return representativeMatrixes;
    }

    /*public static void setMatrixToCacheInMMCJ(JobConf job, boolean left) {
        job.setBoolean(CACHE_LEFT_MATRIX_FOR_MMCJ_CONFIG, left);
    }

    public static boolean getMatrixToCacheInMMCJ(JobConf job) {
        return job.getBoolean(CACHE_LEFT_MATRIX_FOR_MMCJ_CONFIG, true);
    }*/

    public static void setInstructionsInMapper(JobConf job, String instructionsInMapper) {
        job.set(INSTRUCTIONS_IN_MAPPER_CONFIG, instructionsInMapper);
    }

    public static void setAggregateInstructions(JobConf job, String aggInstructionsInReducer) {
        job.set(AGGREGATE_INSTRUCTIONS_CONFIG, aggInstructionsInReducer);
    }

    public static void setReblockInstructions(JobConf job, String reblockInstructions) {
        job.set(REBLOCK_INSTRUCTIONS_CONFIG, reblockInstructions);
    }

    public static void setCSVReblockInstructions(JobConf job, String reblockInstructions) {
        job.set(CSV_REBLOCK_INSTRUCTIONS_CONFIG, reblockInstructions);
    }

    public static void setCSVWriteInstructions(JobConf job, String csvWriteInstructions) {
        job.set(CSV_WRITE_INSTRUCTIONS_CONFIG, csvWriteInstructions);
    }

    public static void setCombineInstructions(JobConf job, String combineInstructions) {
        job.set(COMBINE_INSTRUCTIONS_CONFIG, combineInstructions);
    }

    public static void setInstructionsInReducer(JobConf job, String instructionsInReducer) {
        if(instructionsInReducer!=null)
            job.set(INSTRUCTIONS_IN_REDUCER_CONFIG, instructionsInReducer);
    }

    public static void setAggregateBinaryInstructions(JobConf job, String aggBinInstrctions) {
        job.set(AGGREGATE_BINARY_INSTRUCTIONS_CONFIG, aggBinInstrctions);
    }

    public static void setCM_N_COMInstructions(JobConf job, String cmInstrctions) {
        job.set(CM_N_COV_INSTRUCTIONS_CONFIG, cmInstrctions);
    }

    public static void setGroupedAggInstructions(JobConf job, String grpaggInstructions) {
        job.set(GROUPEDAGG_INSTRUCTIONS_CONFIG, grpaggInstructions);
    }

    public static void setRandInstructions(JobConf job, String randInstrctions) {
        job.set(RAND_INSTRUCTIONS_CONFIG, randInstrctions);
    }

    // TODO: check Rand
    public static DataGenMRInstruction[] getDataGenInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(RAND_INSTRUCTIONS_CONFIG);
        return MRInstructionParser.parseDataGenInstructions(str);
    }

    public static AggregateBinaryInstruction[] getAggregateBinaryInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(AGGREGATE_BINARY_INSTRUCTIONS_CONFIG);
        return MRInstructionParser.parseAggregateBinaryInstructions(str);
    }
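    /*
     * Note (editorial addition): the instruction setters above store the
     * compiler-generated instruction strings verbatim; the corresponding getters
     * parse them back through MRInstructionParser, e.g. (mapperInst is an assumption):
     *
     *   MRJobConfiguration.setInstructionsInMapper(job, mapperInst);
     *   MRInstruction[] ins = MRJobConfiguration.getInstructionsInMapper(job);
     */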
    public static CM_N_COVInstruction[] getCM_N_COVInstructions(JobConf job) throws DMLRuntimeException {
        String str=job.get(CM_N_COV_INSTRUCTIONS_CONFIG);
        return MRInstructionParser.parseCM_N_COVInstructions(str);
    }

    public static GroupedAggregateInstruction[] getGroupedAggregateInstructions(JobConf job)
        throws DMLRuntimeException
    {
        //parse all grouped aggregate instructions
        String str=job.get(GROUPEDAGG_INSTRUCTIONS_CONFIG);
        GroupedAggregateInstruction[] tmp = MRInstructionParser.parseGroupedAggInstructions(str);

        //obtain bclen for all instructions
        for( int i=0; i< tmp.length; i++ ) {
            byte tag = tmp[i].input;
            tmp[i].setBclen(getMatrixCharacteristicsForInput(job, tag).getColsPerBlock());
        }

        return tmp;
    }

    public static String[] getOutputs(JobConf job) {
        return job.getStrings(OUTPUT_MATRICES_DIRS_CONFIG);
    }

    private static byte[] stringArrayToByteArray(String[] istrs) {
        byte[] ret=new byte[istrs.length];
        for(int i=0; i<istrs.length; i++)
            ret[i]=Byte.parseByte(istrs[i]);
        return ret;
    }

    public static byte[] getResultIndexes(JobConf job) {
        String[] istrs=job.get(RESULT_INDEXES_CONFIG).split(Instruction.INSTRUCTION_DELIM);
        return stringArrayToByteArray(istrs);
    }

    public static byte[] getResultDimsUnknown(JobConf job) {
        String str=job.get(RESULT_DIMS_UNKNOWN_CONFIG);
        if (str==null || str.isEmpty())
            return null;
        String[] istrs=str.split(Instruction.INSTRUCTION_DELIM);
        return stringArrayToByteArray(istrs);
    }

    public static byte[] getIntermediateMatrixIndexes(JobConf job) {
        String str=job.get(INTERMEDIATE_INDEXES_CONFIG);
        if(str==null || str.isEmpty())
            return null;
        String[] istrs=str.split(Instruction.INSTRUCTION_DELIM);
        return stringArrayToByteArray(istrs);
    }

    public static void setIntermediateMatrixIndexes(JobConf job, HashSet<Byte> indexes) {
        job.set(INTERMEDIATE_INDEXES_CONFIG, getIndexesString(indexes));
    }

    public static void setDimsUnknownFilePrefix(JobConf job, String prefix) {
        job.setStrings(DIMS_UNKNOWN_FILE_PREFIX, prefix);
    }

    public static void setMatricesDimensions(JobConf job, byte[] inputIndexes, long[] rlens, long[] clens) {
        if(rlens.length!=clens.length)
            throw new RuntimeException("rlens.length should match clens.length");
        for(int i=0; i<rlens.length; i++)
            setMatrixDimension(job, inputIndexes[i], rlens[i], clens[i]);
    }

    public static void setMatricesDimensions(JobConf job, byte[] inputIndexes, long[] rlens, long[] clens, long[] nnz) {
        if(rlens.length!=clens.length)
            throw new RuntimeException("rlens.length should match clens.length");
        for(int i=0; i<rlens.length; i++)
            setMatrixDimension(job, inputIndexes[i], rlens[i], clens[i], nnz[i]);
    }

    public static void setMatrixDimension(JobConf job, byte matrixIndex, long rlen, long clen) {
        job.setLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+matrixIndex, rlen);
        job.setLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+matrixIndex, clen);
    }

    public static void setMatrixDimension(JobConf job, byte matrixIndex, long rlen, long clen, long nnz) {
        job.setLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+matrixIndex, rlen);
        job.setLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+matrixIndex, clen);
        job.setLong(INPUT_MATRIX_NUM_NNZ_PREFIX_CONFIG+matrixIndex, nnz);
    }

    public static String[] getInputPaths(JobConf job) {
        return job.getStrings(INPUT_MATRICIES_DIRS_CONFIG);
    }

    public static long getNumRows(JobConf job, byte matrixIndex) {
        return job.getLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+matrixIndex, 0);
    }

    public static long getNumColumns(JobConf job, byte matrixIndex) {
        return job.getLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+matrixIndex, 0);
    }
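    /*
     * Illustrative sketch (editorial addition; values made up): per-matrix
     * dimensions are keyed by the byte index of the input.
     *
     *   byte[] ixs = new byte[]{0, 1};
     *   MRJobConfiguration.setMatricesDimensions(job, ixs,
     *       new long[]{1000, 2000}, new long[]{10, 20});
     *   long rows1 = MRJobConfiguration.getNumRows(job, (byte)1); //2000
     */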
    public static void setBlocksSizes(JobConf job, byte[] inputIndexes, int[] brlens, int[] bclens) {
        if(brlens.length!=bclens.length)
            throw new RuntimeException("brlens.length should match bclens.length");
        for(int i=0; i<brlens.length; i++)
            setBlockSize(job, inputIndexes[i], brlens[i], bclens[i]);
    }

    public static void setBlockSize(JobConf job, byte matrixIndex, int brlen, int bclen) {
        job.setInt(INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+matrixIndex, brlen);
        job.setInt(INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+matrixIndex, bclen);
    }

    public static int getNumRowsPerBlock(JobConf job, byte matrixIndex) {
        return job.getInt(INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+matrixIndex, 1);
    }

    public static int getNumColumnsPerBlock(JobConf job, byte matrixIndex) {
        return job.getInt(INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+matrixIndex, 1);
    }

    public static long getNumNonZero(JobConf job, byte matrixIndex) {
        return job.getLong(INPUT_MATRIX_NUM_NNZ_PREFIX_CONFIG+matrixIndex, 1);
    }

    public static void setupDistCacheInputs(JobConf job, String indices, String pathsString, ArrayList<String> paths) {
        job.set(DISTCACHE_INPUT_INDICES, indices);
        job.set(DISTCACHE_INPUT_PATHS, pathsString);
        Path p = null;
        for(String spath : paths) {
            p = new Path(spath);
            DistributedCache.addCacheFile(p.toUri(), job);
            DistributedCache.createSymlink(job);
        }
    }

    public static String getDistCacheInputIndices(JobConf job) {
        return job.get(DISTCACHE_INPUT_INDICES);
    }

    private static String getCSVString(PDataPartitionFormat[] formats) {
        if ( formats == null || formats.length == 0 )
            return "";
        StringBuilder s = new StringBuilder();
        s.append(formats[0]);
        for(int i=1; i < formats.length; i++) {
            s.append(",");
            s.append(formats[i]);
        }
        return s.toString();
    }

    public static void setInputPartitioningInfo(JobConf job, PDataPartitionFormat[] pformats) {
        job.set(PARTITIONING_OUTPUT_FORMAT_CONFIG, MRJobConfiguration.getCSVString(pformats));
    }

    private static PDataPartitionFormat[] csv2PFormat(String s) {
        String[] parts = s.split(",");
        PDataPartitionFormat[] pformats = new PDataPartitionFormat[parts.length];
        for(int i=0; i < parts.length; i++) {
            pformats[i] = PDataPartitionFormat.parsePDataPartitionFormat(parts[i]);
        }
        return pformats;
    }

    public static PDataPartitionFormat[] getInputPartitionFormats(JobConf job) {
        return MRJobConfiguration.csv2PFormat(job.get(PARTITIONING_OUTPUT_FORMAT_CONFIG));
    }

    public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos,
            int[] brlens, int[] bclens, boolean setConverter, ConvertTarget target)
        throws Exception
    {
        //conservative initialize (all jobs except GMR)
        boolean[] distCacheOnly = new boolean[inputIndexes.length];
        Arrays.fill(distCacheOnly, false);

        setUpMultipleInputs(job, inputIndexes, inputs, inputInfos, brlens, bclens, distCacheOnly, setConverter, target);
    }

    public static void setUpMultipleInputs(JobConf job, byte[] inputIndexes, String[] inputs, InputInfo[] inputInfos,
            int[] brlens, int[] bclens, boolean[] distCacheOnly, boolean setConverter, ConvertTarget target)
        throws Exception
    {
        if(inputs.length!=inputInfos.length)
            throw new Exception("number of inputs and inputInfos does not match");

        //set up names of the input matrices and their inputformat information
        job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
        MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);

        //set up converter infos (converter determined implicitly)
        if(setConverter) {
            for(int i=0; i<inputs.length; i++)
                setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
        }

        //remove redundant inputs and pure broadcast variables
        ArrayList<Path> lpaths = new ArrayList<Path>();
        ArrayList<InputInfo> liinfos = new ArrayList<InputInfo>();
        for(int i=0; i<inputs.length; i++) {
            Path p = new Path(inputs[i]);

            //check and skip redundant inputs
            if(   lpaths.contains(p)  //path already included
               || distCacheOnly[i] )  //input only required in dist cache
            {
                continue;
            }

            lpaths.add(p);
            liinfos.add(inputInfos[i]);
        }

        boolean combineInputFormat = false;
        if( OptimizerUtils.ALLOW_COMBINE_FILE_INPUT_FORMAT )
        {
            //determine total input sizes
            double totalInputSize = 0;
            for(int i=0; i<inputs.length; i++)
                totalInputSize += MapReduceTool.getFilesizeOnHDFS(new Path(inputs[i]));

            //set max split size (default blocksize) to 2x blocksize if (1) sort buffer large enough,
            //(2) degree of parallelism not hurt, and only a single input (except broadcasts)
            //(the sort buffer size is relevant for pass-through of, potentially modified, inputs to the reducers)
            //(the single input constraint stems from internal runtime assumptions used to relate meta data to inputs)
            long sizeSortBuff = InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer();
            long sizeHDFSBlk = InfrastructureAnalyzer.getHDFSBlockSize();
            long newSplitSize = sizeHDFSBlk * 2;
            //use generic config api for backwards compatibility
            double spillPercent = Double.parseDouble(job.get(MRConfigurationNames.MR_MAP_SORT_SPILL_PERCENT, "1.0"));
            int numPMap = OptimizerUtils.getNumMappers();
            if( numPMap < totalInputSize/newSplitSize && sizeSortBuff*spillPercent >= newSplitSize && lpaths.size()==1 ) {
                job.setLong(MRConfigurationNames.MR_INPUT_FILEINPUTFORMAT_SPLIT_MAXSIZE, newSplitSize);
                combineInputFormat = true;
            }
        }

        //add inputs to jobs input (incl input format configuration)
        for(int i=0; i<lpaths.size(); i++) {
            //add input to job inputs (for binaryblock we use CombineSequenceFileInputFormat to reduce task latency)
            if( combineInputFormat && liinfos.get(i) == InputInfo.BinaryBlockInputInfo )
                MultipleInputs.addInputPath(job, lpaths.get(i), CombineSequenceFileInputFormat.class);
            else
                MultipleInputs.addInputPath(job, lpaths.get(i), liinfos.get(i).inputFormatClass);
        }
    }
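    /*
     * Worked example for the combine-input-format guard above (editorial addition;
     * numbers made up): with a 128MB HDFS block size, newSplitSize = 256MB; a single
     * 10GB input yields totalInputSize/newSplitSize = 40, so with numPMap = 20
     * mappers (20 < 40), a sort buffer of at least 256MB, and exactly one input
     * path, splits are combined up to 256MB to reduce task latency.
     */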
    /**
     * Specific method because we need to set the input converter class according to the
     * input infos. Note that any mapper instruction before reblock can work on binary block
     * if it can work on binary cell as well.
     *
     * @param job job configuration
     * @param inputIndexes array of byte indexes
     * @param inputs array of input strings
     * @param inputInfos array of input infos
     * @param brlens array of block row lengths
     * @param bclens array of block column lengths
     * @throws Exception if Exception occurs
     */
    public static void setUpMultipleInputsReblock(JobConf job, byte[] inputIndexes, String[] inputs,
            InputInfo[] inputInfos, int[] brlens, int[] bclens)
        throws Exception
    {
        if(inputs.length!=inputInfos.length)
            throw new Exception("number of inputs and inputInfos does not match");

        //set up names of the input matrices and their inputformat information
        job.setStrings(INPUT_MATRICIES_DIRS_CONFIG, inputs);
        MRJobConfiguration.setMapFunctionInputMatrixIndexes(job, inputIndexes);

        for(int i=0; i<inputs.length; i++) {
            ConvertTarget target=ConvertTarget.CELL;
            if(inputInfos[i]==InputInfo.BinaryBlockInputInfo)
                target=ConvertTarget.BLOCK;
            setInputInfo(job, inputIndexes[i], inputInfos[i], brlens[i], bclens[i], target);
        }

        //remove redundant input files
        ArrayList<Path> paths=new ArrayList<Path>();
        for(int i=0; i<inputs.length; i++) {
            String name=inputs[i];
            Path p=new Path(name);
            boolean redundant=false;
            for(Path ep: paths)
                if(ep.equals(p)) {
                    redundant=true;
                    break;
                }
            if(redundant)
                continue;
            MultipleInputs.addInputPath(job, p, inputInfos[i].inputFormatClass);
            paths.add(p);
        }
    }

    public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknown, String[] outputs,
            OutputInfo[] outputInfos, boolean inBlockRepresentation, boolean mayContainCtable)
        throws Exception
    {
        if(resultIndexes.length!=outputs.length)
            throw new Exception("number of outputs and result indexes does not match");
        if(outputs.length!=outputInfos.length)
            throw new Exception("number of outputs and outputInfos indexes does not match");

        job.set(RESULT_INDEXES_CONFIG, MRJobConfiguration.getIndexesString(resultIndexes));
        job.set(RESULT_DIMS_UNKNOWN_CONFIG, MRJobConfiguration.getIndexesString(resultDimsUnknown));
        job.setStrings(OUTPUT_MATRICES_DIRS_CONFIG, outputs);
        job.setOutputCommitter(MultipleOutputCommitter.class);

        for(int i=0; i<outputs.length; i++) {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(outputs[i]), job);
            if ( mayContainCtable && resultDimsUnknown[i] == (byte) 1 )  {
                setOutputInfo(job, i, outputInfos[i], false);
            } else {
                setOutputInfo(job, i, outputInfos[i], inBlockRepresentation);
            }
            MultipleOutputs.addNamedOutput(job, Integer.toString(i),
                outputInfos[i].outputFormatClass, outputInfos[i].outputKeyClass,
                outputInfos[i].outputValueClass);
        }
        job.setOutputFormat(NullOutputFormat.class);

        // configure temp output
        Path tempOutputPath = new Path( constructTempOutputFilename() );
        FileOutputFormat.setOutputPath(job, tempOutputPath);
        MapReduceTool.deleteFileIfExistOnHDFS(tempOutputPath, job);
    }

    public static void setUpMultipleOutputs(JobConf job, byte[] resultIndexes, byte[] resultDimsUnknwon, String[] outputs,
            OutputInfo[] outputInfos, boolean inBlockRepresentation)
        throws Exception
    {
        setUpMultipleOutputs(job, resultIndexes, resultDimsUnknwon, outputs,
            outputInfos, inBlockRepresentation, false);
    }

    public static String setUpSortPartitionFilename( JobConf job ) {
        String pfname = constructPartitionFilename();
        job.set( SORT_PARTITION_FILENAME, pfname );
        return pfname;
    }

    public static String getSortPartitionFilename( JobConf job ) {
        return job.get( SORT_PARTITION_FILENAME );
    }
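    /*
     * Illustrative call sketch for setUpMultipleOutputs (editorial addition;
     * arguments made up): one result matrix with tag 2, dimensions known.
     *
     *   MRJobConfiguration.setUpMultipleOutputs(job, new byte[]{2}, new byte[]{0},
     *       new String[]{"hdfs:/tmp/out/X"},
     *       new OutputInfo[]{OutputInfo.BinaryBlockOutputInfo}, true);
     */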
    public static MatrixChar_N_ReducerGroups computeMatrixCharacteristics(JobConf job, byte[] inputIndexes,
            String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstructions,
            String otherInstructionsInReducer, byte[] resultIndexes, HashSet<Byte> mapOutputIndexes, boolean forMMCJ)
        throws DMLRuntimeException
    {
        return computeMatrixCharacteristics(job, inputIndexes, null, instructionsInMapper, null,
            aggInstructionsInReducer, aggBinInstructions, otherInstructionsInReducer, resultIndexes,
            mapOutputIndexes, forMMCJ);
    }

    public static MatrixChar_N_ReducerGroups computeMatrixCharacteristics(JobConf job, byte[] inputIndexes,
            String instructionsInMapper, String reblockInstructions, String aggInstructionsInReducer,
            String aggBinInstructions, String otherInstructionsInReducer, byte[] resultIndexes,
            HashSet<Byte> mapOutputIndexes, boolean forMMCJ)
        throws DMLRuntimeException
    {
        return computeMatrixCharacteristics(job, inputIndexes, null, instructionsInMapper, reblockInstructions,
            aggInstructionsInReducer, aggBinInstructions, otherInstructionsInReducer, resultIndexes,
            mapOutputIndexes, forMMCJ);
    }

    public static void setNumReducers(JobConf job, long numReducerGroups, int numFromCompiler) throws IOException {
        JobClient client=new JobClient(job);
        int n=client.getClusterStatus().getMaxReduceTasks();
        //correction max number of reducers on yarn clusters
        if( InfrastructureAnalyzer.isYarnEnabled() )
            n = (int)Math.max( n, YarnClusterAnalyzer.getNumCores()/2 );
        n=Math.min(n, ConfigurationManager.getNumReducers());
        n=Math.min(n, numFromCompiler);
        if(numReducerGroups>0)
            n=(int) Math.min(n, numReducerGroups);
        job.setNumReduceTasks(n);
    }

    public static class MatrixChar_N_ReducerGroups {
        public MatrixCharacteristics[] stats;
        public long numReducerGroups=0;

        public MatrixChar_N_ReducerGroups(MatrixCharacteristics[] sts, long ng) {
            stats=sts;
            numReducerGroups=ng;
        }
    }
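    /*
     * Worked example for setNumReducers (editorial addition; numbers made up):
     * with a cluster maximum of 100 reduce slots, a configured numReducers of 10,
     * numFromCompiler = 8, and numReducerGroups = 5, the job gets
     * min(100, 10, 8, 5) = 5 reduce tasks.
     */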
    /**
     * NOTE: this method needs to be in-sync with MRBaseForCommonInstructions.processOneInstruction,
     * otherwise, the latter will potentially fail with missing dimension information.
     *
     * @param job job configuration
     * @param inputIndexes array of byte indexes
     * @param dataGenInstructions data gen instructions as a string
     * @param instructionsInMapper instructions in mapper as a string
     * @param reblockInstructions reblock instructions as a string
     * @param aggInstructionsInReducer aggregate instructions in reducer as a string
     * @param aggBinInstructions binary aggregate instructions as a string
     * @param otherInstructionsInReducer other instructions in reducer as a string
     * @param resultIndexes array of byte result indexes
     * @param mapOutputIndexes set of map output indexes
     * @param forMMCJ true if computed for an MMCJ job (which has exactly one aggregate binary instruction)
     * @return reducer groups
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public static MatrixChar_N_ReducerGroups computeMatrixCharacteristics(JobConf job, byte[] inputIndexes,
            String dataGenInstructions, String instructionsInMapper, String reblockInstructions,
            String aggInstructionsInReducer, String aggBinInstructions, String otherInstructionsInReducer,
            byte[] resultIndexes, HashSet<Byte> mapOutputIndexes, boolean forMMCJ)
        throws DMLRuntimeException
    {
        HashSet<Byte> intermediateMatrixIndexes=new HashSet<Byte>();
        HashMap<Byte, MatrixCharacteristics> dims=new HashMap<Byte, MatrixCharacteristics>();
        for(byte i: inputIndexes) {
            MatrixCharacteristics dim=new MatrixCharacteristics(getNumRows(job, i), getNumColumns(job, i),
                getNumRowsPerBlock(job, i), getNumColumnsPerBlock(job, i), getNumNonZero(job, i));
            dims.put(i, dim);
        }
        DataGenMRInstruction[] dataGenIns = null;
        dataGenIns = MRInstructionParser.parseDataGenInstructions(dataGenInstructions);
        if(dataGenIns!=null) {
            for(DataGenMRInstruction ins: dataGenIns) {
                MatrixCharacteristics.computeDimension(dims, ins);
            }
        }

        MRInstruction[] insMapper = MRInstructionParser.parseMixedInstructions(instructionsInMapper);
        if(insMapper!=null) {
            for(MRInstruction ins: insMapper) {
                MatrixCharacteristics.computeDimension(dims, ins);

                if( ins instanceof UnaryMRInstructionBase ) {
                    UnaryMRInstructionBase tempIns=(UnaryMRInstructionBase) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input, dims.get(tempIns.input));
                    intermediateMatrixIndexes.add(tempIns.input);
                }
                else if(ins instanceof AppendMInstruction) {
                    AppendMInstruction tempIns=(AppendMInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
                    intermediateMatrixIndexes.add(tempIns.input1);
                }
                else if(ins instanceof AppendGInstruction) {
                    AppendGInstruction tempIns=(AppendGInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
                    intermediateMatrixIndexes.add(tempIns.input1);
                }
                else if(ins instanceof BinaryMInstruction) {
                    BinaryMInstruction tempIns=(BinaryMInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
                    intermediateMatrixIndexes.add(tempIns.input1);
                }
                else if(ins instanceof AggregateBinaryInstruction) {
                    AggregateBinaryInstruction tempIns=(AggregateBinaryInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
                    intermediateMatrixIndexes.add(tempIns.input1); //TODO
                }
                else if(ins instanceof MapMultChainInstruction) {
                    MapMultChainInstruction tempIns=(MapMultChainInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.getInput1(), dims.get(tempIns.getInput2()));
                    intermediateMatrixIndexes.add(tempIns.getInput1());
                }
                else if(ins instanceof PMMJMRInstruction) {
                    PMMJMRInstruction tempIns=(PMMJMRInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input2, dims.get(tempIns.input2));
                    intermediateMatrixIndexes.add(tempIns.input2);
                }
            }
        }

        ReblockInstruction[] reblockIns = MRInstructionParser.parseReblockInstructions(reblockInstructions);
        if(reblockIns!=null) {
            for(ReblockInstruction ins: reblockIns) {
                MatrixCharacteristics.computeDimension(dims, ins);
                setMatrixCharactristicsForReblock(job, ins.output, dims.get(ins.output));
            }
        }

        Instruction[] aggIns = MRInstructionParser.parseAggregateInstructions(aggInstructionsInReducer);
        if(aggIns!=null) {
            for(Instruction ins: aggIns) {
                MatrixCharacteristics.computeDimension(dims, (MRInstruction) ins);
                // if instruction's output is not in resultIndexes, then add its dimensions to jobconf
                MRInstruction mrins = (MRInstruction)ins;
                boolean found = false;
                for(byte b : resultIndexes) {
                    if(b==mrins.output) {
                        found = true;
                        break;
                    }
                }
                if(!found) {
                    setIntermediateMatrixCharactristics(job, mrins.output, dims.get(mrins.output));
                    intermediateMatrixIndexes.add(mrins.output);
                }
            }
        }

        long numReduceGroups=0;
        AggregateBinaryInstruction[] aggBinIns = getAggregateBinaryInstructions(job);
        if(aggBinIns!=null) {
            for(AggregateBinaryInstruction ins: aggBinIns) {
                MatrixCharacteristics dim1=dims.get(ins.input1);
                MatrixCharacteristics dim2=dims.get(ins.input2);
                setMatrixCharactristicsForBinAgg(job, ins.input1, dim1);
                setMatrixCharactristicsForBinAgg(job, ins.input2, dim2);
                MatrixCharacteristics.computeDimension(dims, ins);
                if(forMMCJ) //there will be only one aggbin operation for MMCJ
                    numReduceGroups=(long) Math.ceil((double)dim1.getCols()/(double)dim1.getColsPerBlock());
            }
        }
        if(!forMMCJ) {
            //store the skylines
            ArrayList<Long> xs=new ArrayList<Long>(mapOutputIndexes.size());
            ArrayList<Long> ys=new ArrayList<Long>(mapOutputIndexes.size());
            for(byte idx: mapOutputIndexes) {
                MatrixCharacteristics dim=dims.get(idx);
                long x=(long)Math.ceil((double)dim.getRows()/(double)dim.getRowsPerBlock());
                long y=(long)Math.ceil((double)dim.getCols()/(double)dim.getColsPerBlock());

                int i=0;
                boolean toadd=true;
                while(i<xs.size()) {
                    if( (x>=xs.get(i)&&y>ys.get(i)) || (x>xs.get(i)&&y>=ys.get(i))) {
                        //remove any included x's and y's
                        xs.remove(i);
                        ys.remove(i);
                    }
                    else if(x<=xs.get(i) && y<=ys.get(i)) { //if included in others, stop
                        toadd=false;
                        break;
                    }
                    else
                        i++;
                }
                if(toadd) {
                    xs.add(x);
                    ys.add(y);
                }
            }
            //sort by x
            TreeMap<Long, Long> map=new TreeMap<Long, Long>();
            for(int i=0; i<xs.size(); i++)
                map.put(xs.get(i), ys.get(i));
            numReduceGroups=0;
            //compute area
            long prev=0;
            for(Entry<Long, Long> e: map.entrySet()) {
                numReduceGroups+=(e.getKey()-prev)*e.getValue();
                prev=e.getKey();
            }
        }

        MRInstruction[] insReducer = MRInstructionParser.parseMixedInstructions(otherInstructionsInReducer);
        if(insReducer!=null) {
            for(MRInstruction ins: insReducer) {
                MatrixCharacteristics.computeDimension(dims, ins);
                if( ins instanceof UnaryMRInstructionBase ) {
                    UnaryMRInstructionBase tempIns=(UnaryMRInstructionBase) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input, dims.get(tempIns.input));
                    intermediateMatrixIndexes.add(tempIns.input);
                }
                else if( ins instanceof RemoveEmptyMRInstruction ) {
                    RemoveEmptyMRInstruction tempIns = (RemoveEmptyMRInstruction) ins;
                    setIntermediateMatrixCharactristics(job, tempIns.input1, dims.get(tempIns.input1));
                    intermediateMatrixIndexes.add(tempIns.input1);
                }

                // if instruction's output is not in resultIndexes, then add its dimensions to jobconf
                boolean found = false;
                for(byte b : resultIndexes) {
                    if(b==ins.output) {
                        found = true;
                        break;
                    }
                }
                if(!found) {
                    setIntermediateMatrixCharactristics(job, ins.output, dims.get(ins.output));
                    intermediateMatrixIndexes.add(ins.output);
                }
            }
        }

        setIntermediateMatrixIndexes(job, intermediateMatrixIndexes);

        for (byte tag : mapOutputIndexes)
            setMatrixCharactristicsForMapperOutput(job, tag, dims.get(tag));

        MatrixCharacteristics[] stats=new MatrixCharacteristics[resultIndexes.length];
        MatrixCharacteristics resultDims;
        for(int i=0; i<resultIndexes.length; i++) {
            resultDims = dims.get(resultIndexes[i]);
            stats[i]=resultDims;
            setMatrixCharactristicsForOutput(job, resultIndexes[i], stats[i]);
        }

        return new MatrixChar_N_ReducerGroups(stats, numReduceGroups);
    }
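    /*
     * Worked example for the skyline-based numReduceGroups computation above
     * (editorial addition): map-output block grids (x = ceil(rows/brlen),
     * y = ceil(cols/bclen)) of (4,1), (2,3), and (1,2) keep only the non-dominated
     * points (4,1) and (2,3), since (1,2) is dominated by (2,3); sorted by x, the
     * covered area is (2-0)*3 + (4-2)*1 = 8 reducer groups.
     */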
    public static void setIntermediateMatrixCharactristics(JobConf job, byte tag, MatrixCharacteristics dim) {
        job.setLong(INTERMEDIATE_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, dim.getRows());
        job.setLong(INTERMEDIATE_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getCols());
        job.setInt(INTERMEDIATE_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, dim.getRowsPerBlock());
        job.setInt(INTERMEDIATE_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getColsPerBlock());
    }

    public static MatrixCharacteristics getIntermediateMatrixCharactristics(JobConf job, byte tag) {
        MatrixCharacteristics dim=new MatrixCharacteristics();
        dim.setDimension( job.getLong(INTERMEDIATE_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, 0),
                          job.getLong(INTERMEDIATE_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, 0) );
        dim.setBlockSize( job.getInt(INTERMEDIATE_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, 1),
                          job.getInt(INTERMEDIATE_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, 1) );
        return dim;
    }

    public static void setMatrixCharactristicsForOutput(JobConf job, byte tag, MatrixCharacteristics dim) {
        job.setLong(OUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, dim.getRows());
        job.setLong(OUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getCols());
        job.setInt(OUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, dim.getRowsPerBlock());
        job.setInt(OUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getColsPerBlock());
    }

    public static MatrixCharacteristics getMatrixCharacteristicsForOutput(JobConf job, byte tag) {
        MatrixCharacteristics dim=new MatrixCharacteristics();
        dim.setDimension( job.getLong(OUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, 0),
                          job.getLong(OUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, 0) );
        dim.setBlockSize( job.getInt(OUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, 1),
                          job.getInt(OUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, 1) );
        return dim;
    }

    public static MatrixCharacteristics getMatrixCharacteristicsForInput(JobConf job, byte tag) {
        MatrixCharacteristics dim=new MatrixCharacteristics();
        dim.setDimension( job.getLong(INPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, 0),
                          job.getLong(INPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, 0) );
        dim.setBlockSize( job.getInt(INPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, 1),
                          job.getInt(INPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, 1) );
        return dim;
    }

    public static void setMatrixCharactristicsForMapperOutput(JobConf job, byte tag, MatrixCharacteristics dim) {
        job.setLong(MAPOUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, dim.getRows());
        job.setLong(MAPOUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getCols());
        job.setInt(MAPOUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, dim.getRowsPerBlock());
        job.setInt(MAPOUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getColsPerBlock());
    }

    public static MatrixCharacteristics getMatrixCharacteristicsForMapOutput(JobConf job, byte tag) {
        MatrixCharacteristics dim=new MatrixCharacteristics();
        dim.setDimension( job.getLong(MAPOUTPUT_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, 0),
                          job.getLong(MAPOUTPUT_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, 0) );
        dim.setBlockSize( job.getInt(MAPOUTPUT_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, 1),
                          job.getInt(MAPOUTPUT_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, 1) );
        return dim;
    }

    public static void setMatrixCharactristicsForReblock(JobConf job, byte tag, MatrixCharacteristics dim) {
        job.setLong(REBLOCK_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, dim.getRows());
        job.setLong(REBLOCK_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getCols());
        job.setInt(REBLOCK_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, dim.getRowsPerBlock());
        job.setInt(REBLOCK_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getColsPerBlock());
        job.setLong(REBLOCK_MATRIX_NUM_NNZ_PREFIX_CONFIG+tag, dim.getNonZeros());
    }
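    /*
     * Illustrative round trip (editorial addition): each setter and getter pair
     * above mirrors a MatrixCharacteristics through per-tag JobConf entries.
     *
     *   MatrixCharacteristics mc = new MatrixCharacteristics(1000, 500, 1000, 1000);
     *   MRJobConfiguration.setMatrixCharactristicsForOutput(job, (byte)0, mc);
     *   MatrixCharacteristics mc2 = MRJobConfiguration.getMatrixCharacteristicsForOutput(job, (byte)0);
     *   //mc2 equals mc (nnz is not tracked for outputs)
     */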
    public static MatrixCharacteristics getMatrixCharactristicsForReblock(JobConf job, byte tag) {
        MatrixCharacteristics dim = new MatrixCharacteristics();
        dim.setDimension( job.getLong(REBLOCK_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, 0),
            job.getLong(REBLOCK_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, 0) );
        dim.setBlockSize( job.getInt(REBLOCK_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, 1),
            job.getInt(REBLOCK_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, 1) );

        long nnz = job.getLong(REBLOCK_MATRIX_NUM_NNZ_PREFIX_CONFIG+tag, -1);
        if( nnz >= 0 )
            dim.setNonZeros( nnz );

        return dim;
    }

    public static void setMatrixCharactristicsForBinAgg(JobConf job, byte tag, MatrixCharacteristics dim) {
        job.setLong(AGGBIN_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, dim.getRows());
        job.setLong(AGGBIN_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getCols());
        job.setInt(AGGBIN_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, dim.getRowsPerBlock());
        job.setInt(AGGBIN_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, dim.getColsPerBlock());
    }

    public static MatrixCharacteristics getMatrixCharactristicsForBinAgg(JobConf job, byte tag) {
        MatrixCharacteristics dim = new MatrixCharacteristics();
        dim.setDimension( job.getLong(AGGBIN_MATRIX_NUM_ROW_PREFIX_CONFIG+tag, 0),
            job.getLong(AGGBIN_MATRIX_NUM_COLUMN_PREFIX_CONFIG+tag, 0) );
        dim.setBlockSize( job.getInt(AGGBIN_BLOCK_NUM_ROW_PREFIX_CONFIG+tag, 1),
            job.getInt(AGGBIN_BLOCK_NUM_COLUMN_PREFIX_CONFIG+tag, 1) );
        return dim;
    }

    public static HashSet<Byte> setUpOutputIndexesForMapper(JobConf job, byte[] inputIndexes, String instructionsInMapper,
            String aggInstructionsInReducer, String otherInstructionsInReducer, byte[] resultIndexes)
        throws DMLRuntimeException
    {
        return setUpOutputIndexesForMapper(job, inputIndexes, null, instructionsInMapper, null,
            aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    }

    public static HashSet<Byte> setUpOutputIndexesForMapper(JobConf job, byte[] inputIndexes, String instructionsInMapper,
            String reblockInstructions, String aggInstructionsInReducer, String otherInstructionsInReducer, byte[] resultIndexes)
        throws DMLRuntimeException
    {
        return setUpOutputIndexesForMapper(job, inputIndexes, null, instructionsInMapper, reblockInstructions,
            aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
    }

    public static HashSet<Byte> setUpOutputIndexesForMapper(JobConf job, byte[] inputIndexes, String randInstructions,
            String instructionsInMapper, String reblockInstructions, String aggInstructionsInReducer,
            String otherInstructionsInReducer, byte[] resultIndexes)
        throws DMLRuntimeException
    {
        //find out what results are needed to send to reducers
        HashSet<Byte> indexesInMapper = new HashSet<Byte>();
        for( byte b : inputIndexes )
            indexesInMapper.add(b);

        DataGenMRInstruction[] dataGenIns = MRInstructionParser.parseDataGenInstructions(randInstructions);
        getIndexes(dataGenIns, indexesInMapper);

        MRInstruction[] insMapper = MRInstructionParser.parseMixedInstructions(instructionsInMapper);
        getIndexes(insMapper, indexesInMapper);

        ReblockInstruction[] reblockIns = MRInstructionParser.parseReblockInstructions(reblockInstructions);
        getIndexes(reblockIns, indexesInMapper);

        MRInstruction[] insReducer = MRInstructionParser.parseAggregateInstructions(aggInstructionsInReducer);
        HashSet<Byte> indexesInReducer = new HashSet<Byte>();
        getIndexes(insReducer, indexesInReducer);

        insReducer = MRInstructionParser.parseMixedInstructions(otherInstructionsInReducer);
        getIndexes(insReducer, indexesInReducer);

        for( byte ind : resultIndexes )
            indexesInReducer.add(ind);

        //only indexes that are both produced in the map phase and consumed in the
        //reduce phase need to be sent over the shuffle (set intersection)
        indexesInMapper.retainAll(indexesInReducer);

        job.set(OUTPUT_INDEXES_IN_MAPPER_CONFIG, getIndexesString(indexesInMapper));
        return indexesInMapper;
    }
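    /**
     * Constructs the collector for the job's result matrices: one output
     * converter and one set of matrix characteristics per result index,
     * wrapped around Hadoop's MultipleOutputs.
     */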
    public static CollectMultipleConvertedOutputs getMultipleConvertedOutputs(JobConf job) {
        byte[] resultIndexes = MRJobConfiguration.getResultIndexes(job);
        Converter[] outputConverters = new Converter[resultIndexes.length];
        MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
        HashMap<Byte, ArrayList<Integer>> tagMapping = new HashMap<Byte, ArrayList<Integer>>();
        for( int i=0; i<resultIndexes.length; i++ ) {
            byte output = resultIndexes[i];
            ArrayList<Integer> vec = tagMapping.get(output);
            if( vec == null ) {
                vec = new ArrayList<Integer>();
                tagMapping.put(output, vec);
            }
            vec.add(i);

            outputConverters[i] = getOuputConverter(job, i);
            stats[i] = MRJobConfiguration.getMatrixCharacteristicsForOutput(job, output);
        }

        MultipleOutputs multipleOutputs = new MultipleOutputs(job);

        return new CollectMultipleConvertedOutputs(outputConverters, stats, multipleOutputs);
    }

    private static void getIndexes(MRInstruction[] instructions, HashSet<Byte> indexes)
        throws DMLRuntimeException
    {
        if( instructions == null )
            return;
        for( MRInstruction ins : instructions ) {
            for( byte i : ins.getAllIndexes() )
                indexes.add(i);
        }
    }

    private static String getIndexesString(HashSet<Byte> indexes) {
        if( indexes == null || indexes.isEmpty() )
            return "";
        StringBuilder sb = new StringBuilder();
        for( Byte ind : indexes ) {
            sb.append(ind);
            sb.append(Instruction.INSTRUCTION_DELIM);
        }
        //return string without last character
        return sb.substring(0, sb.length()-1);
    }

    private static String getIndexesString(byte[] indexes) {
        if( indexes == null || indexes.length == 0 )
            return "";
        StringBuilder sb = new StringBuilder();
        for( Byte ind : indexes ) {
            sb.append(ind);
            sb.append(Instruction.INSTRUCTION_DELIM);
        }
        //return string without last character
        return sb.substring(0, sb.length()-1);
    }

    public static void setMapFunctionInputMatrixIndexes(JobConf job, byte[] realIndexes) {
        job.set(MAPFUNC_INPUT_MATRICIES_INDEXES_CONFIG, getIndexesString(realIndexes));
    }

    public static boolean deriveRepresentation(InputInfo[] inputInfos) {
        for( InputInfo input : inputInfos ) {
            if( !(input.inputValueClass == MatrixBlock.class) ) {
                return false;
            }
        }
        return true;
    }

    public static String constructTempOutputFilename() {
        StringBuilder sb = new StringBuilder();
        sb.append(ConfigurationManager.getScratchSpace());
        sb.append(Lop.FILE_SEPARATOR);
        sb.append(Lop.PROCESS_PREFIX);
        sb.append(DMLScript.getUUID());
        sb.append(Lop.FILE_SEPARATOR);
        sb.append("TmpOutput"+seq.getNextID());

        //old unique dir (no guarantees):
        //sb.append(Integer.toHexString(new Random().nextInt(Integer.MAX_VALUE)));

        return sb.toString();
    }

    private static String constructPartitionFilename() {
        StringBuilder sb = new StringBuilder();
        sb.append(ConfigurationManager.getScratchSpace());
        sb.append(Lop.FILE_SEPARATOR);
        sb.append(Lop.PROCESS_PREFIX);
        sb.append(DMLScript.getUUID());
        sb.append(Lop.FILE_SEPARATOR);
        sb.append(SamplingSortMRInputFormat.PARTITION_FILENAME+seq.getNextID());

        //old unique dir (no guarantees):
        //sb.append(Integer.toHexString(new Random().nextInt(Integer.MAX_VALUE)));

        return sb.toString();
    }

    public static void setSystemMLLocalTmpDir(JobConf job, String dir) {
        job.set(SYSTEMML_LOCAL_TMP_DIR, dir);
    }

    public static String getSystemMLLocalTmpDir(JobConf job) {
        return job.get(SYSTEMML_LOCAL_TMP_DIR);
    }

    public static void addBinaryBlockSerializationFramework( Configuration job ) {
        String frameworkList = job.get(MRConfigurationNames.IO_SERIALIZATIONS);
        String frameworkClassBB = BinaryBlockSerialization.class.getCanonicalName();
        job.set(MRConfigurationNames.IO_SERIALIZATIONS, frameworkClassBB+","+frameworkList);
    }
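    //Illustration of the ordering above (hypothetical property values): if
    //io.serializations previously contained
    //  org.apache.hadoop.io.serializer.WritableSerialization
    //then after addBinaryBlockSerializationFramework it reads
    //  org.apache.sysml.runtime.io.BinaryBlockSerialization,org.apache.hadoop.io.serializer.WritableSerialization
    //so Hadoop's SerializationFactory resolves matching classes to the custom
    //binary-block framework first and falls back to writables otherwise.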
    /**
     * Set all configurations with prefix mapred or mapreduce that exist in the
     * given DMLConfig into the given JobConf.
     *
     * @param job job configuration
     * @param config dml configuration
     */
    public static void setupCustomMRConfigurations( JobConf job, DMLConfig config ) {
        Map<String,String> map = config.getCustomMRConfig();
        for( Entry<String,String> e : map.entrySet() ) {
            job.set(e.getKey(), e.getValue());
        }
    }
}
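//Usage sketch (hypothetical driver code; 'config' is assumed to be an already
//parsed DMLConfig instance):
//  JobConf job = new JobConf(MRJobConfiguration.class);
//  MRJobConfiguration.setupCustomMRConfigurations(job, config);   //forward mapred./mapreduce. overrides
//  MRJobConfiguration.addBinaryBlockSerializationFramework(job);  //prefer binary-block serialization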