/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.matrix.mapred;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixPackedCell;
import org.apache.sysml.runtime.matrix.data.MatrixValue;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixValue;
import org.apache.sysml.runtime.util.MapReduceTool;
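/**
 * Mapper of the generic MR job (GMR): executes the configured mapper
 * instructions on every input block and either forwards the tagged results
 * to the reducers or, for map-only jobs, writes them directly to the
 * converted final outputs while tracking non-zeros and max dimensions.
 *
 * A minimal wiring sketch (for illustration only; in SystemML the complete
 * job setup, including instructions and output converters, is done via
 * MRJobConfiguration):
 * <pre>{@code
 * JobConf job = new JobConf();
 * job.setMapperClass(GMRMapper.class); //this class
 * job.setNumReduceTasks(0);            //0 reducers implies a map-only job
 * }</pre>
 */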
public class GMRMapper extends MapperBase
implements Mapper<Writable, Writable, Writable, Writable>
{
//whether this is a map-only job
private boolean mapOnlyJob=false;
//the final result indexes that need to be written out for a map-only job
protected byte[] resultIndexes=null;
protected byte[] resultDimsUnknown=null;
//output converters for a map-only job
protected CollectMultipleConvertedOutputs collectFinalMultipleOutputs;
//counters of how many non-zero cells have been produced per output (map-only job)
protected long[] resultsNonZeros=null;
protected long[] resultsMaxRowDims=null;
protected long[] resultsMaxColDims=null;
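//path prefix for per-task files that record the max row/column dims of outputs with unknown dimensions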
protected String dimsUnknownFilePrefix;
//cached reporter to report the number of non-zeros for each map task
protected Reporter cachedReporter=null;
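//task attempt ID of this map task (parsed in close to derive the job and task IDs)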
protected String mapperID;
//temporary variables
private TaggedMatrixValue taggedValueBuffer=null;
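//mapping from output tag to the positions in resultIndexes that consume it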
private HashMap<Byte, ArrayList<Integer>> tagMapping;
//empty block filter flags
private boolean _filterEmptyInputBlocks = false;
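/**
 * Main map function: caches the reporter (for counter maintenance in close),
 * optionally filters empty input blocks, and delegates to the common map
 * runtime (input conversion and the special operations overridden below).
 */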
@Override
public void map(Writable rawKey, Writable rawValue, OutputCollector<Writable, Writable> out, Reporter reporter)
throws IOException
{
//cache reporter for counters in close
cachedReporter = reporter;
//empty block input filter
if( _filterEmptyInputBlocks && ((MatrixValue)rawValue).isEmpty() )
return;
//default map runtime (input converters, call to overridden special operations)
commonMap(rawKey, rawValue, out, reporter);
}
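/**
 * Applies all mapper instructions to the given input and emits the results:
 * directly to the final outputs for a map-only job, tagged to the reducers
 * otherwise.
 */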
@Override
protected void specialOperationsForActualMap(int index, OutputCollector<Writable, Writable> out, Reporter reporter)
throws IOException
{
//apply all instructions
processMapperInstructionsForMatrix(index);
//output the results needed by the reducer
if(mapOnlyJob)
processMapFinalOutput(index, taggedValueBuffer, collectFinalMultipleOutputs, reporter, tagMapping);
else
processMapOutputToReducerForGMR(index, taggedValueBuffer, out);
}
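/**
 * Emits all cached result values for the given input index to the reducers,
 * tagged with their output index so the GMR reducer can dispatch them to the
 * correct instructions and outputs.
 */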
protected void processMapOutputToReducerForGMR(int index, TaggedMatrixValue taggedValueBuffer, OutputCollector<Writable, Writable> out)
throws IOException
{
for( byte output: outputIndexes.get(index) )
{
ArrayList<IndexedMatrixValue> results = cachedValues.get(output);
if(results == null)
continue;
for(IndexedMatrixValue result : results)
{
if(result == null)
continue;
//prepare tagged output value
//(special case for conversion from MatrixCell to TaggedMatrixPackedCell, e.g., ctable)
if(valueClass.equals(MatrixCell.class))
taggedValueBuffer.getBaseObject().copy(result.getValue());
else
taggedValueBuffer.setBaseObject(result.getValue());
taggedValueBuffer.setTag(output);
//collect output (exactly once)
out.collect( result.getIndexes(), taggedValueBuffer);
}
}
}
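/**
 * Writes all cached result values for the given input index directly to the
 * converted final outputs of a map-only job, updating the per-output
 * statistics (non-zeros, max row/column dims) along the way.
 */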
protected void processMapFinalOutput(int index,
TaggedMatrixValue taggedValueBuffer, CollectMultipleConvertedOutputs collectFinalMultipleOutputs,
Reporter reporter, HashMap<Byte, ArrayList<Integer>> tagMapping) throws IOException
{
for(byte output: outputIndexes.get(index))
{
ArrayList<IndexedMatrixValue> results= cachedValues.get(output);
if(results==null)
continue;
for(IndexedMatrixValue result: results)
{
if(result==null)
continue;
//prepare tagged output value
taggedValueBuffer.setBaseObject(result.getValue());
taggedValueBuffer.setTag(output);
//collect output (for all result indexes)
for( int outputIndex: tagMapping.get(output) )
{
collectOutput_N_Increase_Counter(
result.getIndexes(), taggedValueBuffer.getBaseObject(),
outputIndex, reporter, collectFinalMultipleOutputs,
resultDimsUnknown, resultsNonZeros, resultsMaxRowDims, resultsMaxColDims);
}
}
}
}
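/**
 * Reads the job configuration: task attempt ID, unknown-dimensions file
 * prefix, empty-block filtering, and the type of the tagged value buffer;
 * for map-only jobs, additionally the result indexes, per-output statistics,
 * the tag-to-result-index mapping, and the output converters.
 */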
@Override
public void configure(JobConf job)
{
super.configure(job);
mapperID = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
dimsUnknownFilePrefix = job.get("dims.unknown.file.prefix");
_filterEmptyInputBlocks = allowsFilterEmptyInputBlocks();
//initialize the temporary tagged value buffer
try {
if(job.getMapOutputValueClass().equals(TaggedMatrixPackedCell.class))
taggedValueBuffer=TaggedMatrixValue.createObject(MatrixPackedCell.class);
else
taggedValueBuffer=TaggedMatrixValue.createObject(valueClass);
} catch (Exception e) {
throw new RuntimeException(e);
}
//decide whether this is a map-only job
mapOnlyJob=(job.getNumReduceTasks()<=0);
if(!mapOnlyJob)
return;
//get the indexes of the final output matrices
resultIndexes=MRJobConfiguration.getResultIndexes(job);
resultDimsUnknown = MRJobConfiguration.getResultDimsUnknown(job);
//initialize per-output statistics, later reported as counters (see MRJobConfiguration)
resultsNonZeros=new long[resultIndexes.length];
resultsMaxRowDims=new long[resultIndexes.length];
resultsMaxColDims=new long[resultIndexes.length];
tagMapping=new HashMap<Byte, ArrayList<Integer>>();
for(int i=0; i<resultIndexes.length; i++)
{
byte output=resultIndexes[i];
ArrayList<Integer> vec=tagMapping.get(output);
if(vec==null)
{
vec=new ArrayList<Integer>();
tagMapping.put(output, vec);
}
vec.add(i);
}
//for map only job, get the map output converters
collectFinalMultipleOutputs=MRJobConfiguration.getMultipleConvertedOutputs(job);
}
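/**
 * On close of a map-only job, reports the per-output non-zero counters and,
 * if any output has unknown dimensions (e.g., ctable), writes a per-task file
 * with the max row/column dimensions seen by this task. The task id is parsed
 * from IDs of the standard Hadoop formats task_* or attempt_*, e.g.,
 * attempt_200707121733_0003_m_000005_0.
 */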
@Override
public void close() throws IOException
{
if( cachedReporter!=null && mapOnlyJob )
{
//get and construct task id
String[] parts = mapperID.split("_");
String jobID = "job_" + parts[1] + "_" + parts[2];
int taskid;
if ( parts[0].equalsIgnoreCase("task")) {
taskid = Integer.parseInt(parts[parts.length-1]);
}
else if ( parts[0].equalsIgnoreCase("attempt")) {
taskid = Integer.parseInt(parts[parts.length-2]);
}
else {
throw new RuntimeException("Unrecognized format for reducerID: " + mapperID);
}
//maintain unknown dimensions (if required, e.g., ctable)
boolean dimsUnknown = false;
for(int i=0; i<resultIndexes.length; i++) {
cachedReporter.incrCounter(MRJobConfiguration.NUM_NONZERO_CELLS, Integer.toString(i), resultsNonZeros[i]);
if ( resultDimsUnknown!=null && resultDimsUnknown[i] != (byte) 0 ) {
dimsUnknown = true;
}
}
if ( dimsUnknown ) {
// every task creates a file with max_row and max_col dimensions found in that task
MapReduceTool.writeDimsFile(dimsUnknownFilePrefix + "/" + jobID + "_dimsFile/" + "m_" + taskid , resultDimsUnknown, resultsMaxRowDims, resultsMaxColDims);
}
}
if(collectFinalMultipleOutputs!=null)
collectFinalMultipleOutputs.close();
}
}