/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix;
import java.util.HashSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.instructions.MRJobInstruction;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixBlock;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixCell;
import org.apache.sysml.runtime.matrix.data.TripleIndexes;
import org.apache.sysml.runtime.matrix.mapred.MMRJMRMapper;
import org.apache.sysml.runtime.matrix.mapred.MMRJMRReducer;
import org.apache.sysml.runtime.matrix.mapred.MRConfigurationNames;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.ConvertTarget;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.MatrixChar_N_ReducerGroups;
import org.apache.sysml.yarn.DMLAppMasterUtils;
/*
* inBlockRepresentation: indicate whether to use block representation or cell representation
* inputs: input matrices, the inputs are indexed by 0, 1, 2, .. based on the position in this string
* inputInfos: the input format information for the input matrices
* rlen: the number of rows for each matrix
* clen: the number of columns for each matrix
* brlen: the number of rows per block
* bclen: the number of columns per block
* instructionsInMapper: in Mapper, the set of unary operations that need to be performed on each input matrix
* aggInstructionsInReducer: in Reducer, right after sorting, the set of aggreagte operations that need
* to be performed on each input matrix,
* aggBinInstrction: the aggregate binary instruction for the MMCJ operation
* otherInstructionsInReducer: the mixed operations that need to be performed on matrices after the aggregate operations
* numReducers: the number of reducers
* replication: the replication factor for the output
* resulltIndexes: the indexes of the result matrices that needs to be outputted.
* outputs: the names for the output directories, one for each result index
* outputInfos: output format information for the output matrices
*/
public class MMRJMR
{
private static final Log LOG = LogFactory.getLog(MMRJMR.class.getName());
private MMRJMR() {
//prevent instantiation via private constructor
}
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper,
String aggInstructionsInReducer, String aggBinInstrctions, String otherInstructionsInReducer,
int numReducers, int replication, byte[] resultIndexes,
String[] outputs, OutputInfo[] outputInfos)
throws Exception
{
JobConf job = new JobConf(MMRJMR.class);
job.setJobName("MMRJ-MR");
if(numReducers<=0)
throw new Exception("MMRJ-MR has to have at least one reduce task!");
// TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
boolean inBlockRepresentation=MRJobConfiguration.deriveRepresentation(inputInfos);
//whether use block representation or cell representation
MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
byte[] realIndexes=new byte[inputs.length];
for(byte b=0; b<realIndexes.length; b++)
realIndexes[b]=b;
//set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens,
true, inBlockRepresentation? ConvertTarget.BLOCK: ConvertTarget.CELL);
//set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
//set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
//set up unary instructions that will perform in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
//set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
//set up the aggregate binary operation for the mmcj job
MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrctions);
//set up the instructions that will happen in the reducer, after the aggregation instrucions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
//set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
//set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};
//set up what matrices are needed to pass from the mapper to reducer
HashSet<Byte> mapoutputIndexes=MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer,
aggBinInstrctions, resultIndexes );
MatrixChar_N_ReducerGroups ret=MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes,
instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, otherInstructionsInReducer,
resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats=ret.stats;
//set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
byte[] dimsUnknown = new byte[resultIndexes.length];
for ( int i=0; i < resultIndexes.length; i++ ) {
if ( stats[i].getRows() == -1 || stats[i].getCols() == -1 ) {
dimsUnknown[i] = (byte)1;
}
else {
dimsUnknown[i] = (byte) 0;
}
}
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation);
// configure mapper
job.setMapperClass(MMRJMRMapper.class);
job.setMapOutputKeyClass(TripleIndexes.class);
if(inBlockRepresentation)
job.setMapOutputValueClass(TaggedMatrixBlock.class);
else
job.setMapOutputValueClass(TaggedMatrixCell.class);
job.setOutputKeyComparatorClass(TripleIndexes.Comparator.class);
job.setPartitionerClass(TripleIndexes.FirstTwoIndexesPartitioner.class);
//configure combiner
//TODO: cannot set up combiner, because it will destroy the stable numerical algorithms
// for sum or for central moments
// if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
// job.setCombinerClass(MMCJMRCombiner.class);
//configure reducer
job.setReducerClass(MMRJMRReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for ( int i=0; i < inputs.length; i++ ) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
RunningJob runjob=JobClient.runJob(job);
/* Process different counters */
Group group=runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for(int i=0; i<resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
}