/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.matrix.mapred;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixPackedCell;
import org.apache.sysml.runtime.matrix.data.MatrixValue;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixPackedCell;
import org.apache.sysml.runtime.matrix.data.TaggedMatrixValue;
import org.apache.sysml.runtime.util.MapReduceTool;

public class GMRMapper extends MapperBase 
	implements Mapper<Writable, Writable, Writable, Writable>
{
	//whether this is a map-only job
	private boolean mapOnlyJob = false;
	
	//the final result indexes that need to be written out for a map-only job
	protected byte[] resultIndexes = null;
	protected byte[] resultDimsUnknown = null;
	
	//output converters for a map-only job
	protected CollectMultipleConvertedOutputs collectFinalMultipleOutputs;
	
	//counters recording how many nonzero cells have been produced for each output
	//of a map-only job
	protected long[] resultsNonZeros = null;
	protected long[] resultsMaxRowDims = null;
	protected long[] resultsMaxColDims = null;
	protected String dimsUnknownFilePrefix;
	
	//cached reporter to report the number of nonzeros for each map task
	protected Reporter cachedReporter = null;
	protected String mapperID;
	
	//temporary variables
	private TaggedMatrixValue taggedValueBuffer = null;
	private HashMap<Byte, ArrayList<Integer>> tagMapping;
	
	//empty block filter flag
	private boolean _filterEmptyInputBlocks = false;
	
	@Override
	public void map(Writable rawKey, Writable rawValue, OutputCollector<Writable, Writable> out, Reporter reporter) 
		throws IOException 
	{
		//cache reporter for counters in close
		cachedReporter = reporter;
		
		//empty block input filter
		if( _filterEmptyInputBlocks && ((MatrixValue)rawValue).isEmpty() )
			return;
		
		//default map runtime (input converters, call to overridden special operations)
		commonMap(rawKey, rawValue, out, reporter);
	}
	
	@Override
	protected void specialOperationsForActualMap(int index, OutputCollector<Writable, Writable> out, Reporter reporter) 
		throws IOException 
	{
		//apply all instructions
		processMapperInstructionsForMatrix(index);
		
		//output the results needed by the reducer
		if( mapOnlyJob )
			processMapFinalOutput(index, taggedValueBuffer, collectFinalMultipleOutputs, reporter, tagMapping);
		else
			processMapOutputToReducerForGMR(index, taggedValueBuffer, out);
	}
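	/*
	 * The shuffle protocol used below multiplexes several result matrices over a
	 * single map output stream: the key is the block's indexes and the value is
	 * the block wrapped in a TaggedMatrixValue, whose tag identifies which
	 * instruction output the block belongs to. A minimal sketch of the receiving
	 * side (the dispatch call is a hypothetical placeholder; only getTag and
	 * getBaseObject are actual TaggedMatrixValue API):
	 *
	 *   TaggedMatrixValue tv = ...;               // one value from the shuffle
	 *   byte tag = tv.getTag();                   // which result matrix it feeds
	 *   MatrixValue block = tv.getBaseObject();   // the actual cell/block payload
	 *   dispatchToInstructionsFor(tag, block);    // hypothetical reducer-side hook
	 */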
	protected void processMapOutputToReducerForGMR(int index, TaggedMatrixValue taggedValueBuffer, OutputCollector<Writable, Writable> out) 
		throws IOException
	{
		for( byte output : outputIndexes.get(index) )
		{
			ArrayList<IndexedMatrixValue> results = cachedValues.get(output);
			if( results == null )
				continue;
			for( IndexedMatrixValue result : results )
			{
				if( result == null )
					continue;
				
				//prepare tagged output value
				//(special case for conversion from MatrixCell to TaggedMatrixPackedCell, e.g., ctable)
				if( valueClass.equals(MatrixCell.class) )
					taggedValueBuffer.getBaseObject().copy(result.getValue());
				else
					taggedValueBuffer.setBaseObject(result.getValue());
				taggedValueBuffer.setTag(output);
				
				//collect output (exactly once)
				out.collect( result.getIndexes(), taggedValueBuffer );
			}
		}
	}
	
	protected void processMapFinalOutput(int index, TaggedMatrixValue taggedValueBuffer, 
			CollectMultipleConvertedOutputs collectFinalMultipleOutputs, Reporter reporter, 
			HashMap<Byte, ArrayList<Integer>> tagMapping) 
		throws IOException
	{
		for( byte output : outputIndexes.get(index) )
		{
			ArrayList<IndexedMatrixValue> results = cachedValues.get(output);
			if( results == null )
				continue;
			for( IndexedMatrixValue result : results )
			{
				if( result == null )
					continue;
				
				//prepare tagged output value
				taggedValueBuffer.setBaseObject(result.getValue());
				taggedValueBuffer.setTag(output);
				
				//collect output (for all result indexes)
				for( int outputIndex : tagMapping.get(output) )
				{
					collectOutput_N_Increase_Counter(
						result.getIndexes(), taggedValueBuffer.getBaseObject(), outputIndex, 
						reporter, collectFinalMultipleOutputs, resultDimsUnknown, 
						resultsNonZeros, resultsMaxRowDims, resultsMaxColDims );
				}
			}
		}
	}
	
	public void configure(JobConf job)
	{
		super.configure(job);
		
		mapperID = job.get(MRConfigurationNames.MR_TASK_ATTEMPT_ID);
		dimsUnknownFilePrefix = job.get("dims.unknown.file.prefix");
		_filterEmptyInputBlocks = allowsFilterEmptyInputBlocks();
		
		//assign the temporary variables
		try {
			if( job.getMapOutputValueClass().equals(TaggedMatrixPackedCell.class) )
				taggedValueBuffer = TaggedMatrixValue.createObject(MatrixPackedCell.class);
			else
				taggedValueBuffer = TaggedMatrixValue.createObject(valueClass);
		}
		catch(Exception e) {
			throw new RuntimeException(e);
		}
		
		//decide whether it is a map-only job
		mapOnlyJob = (job.getNumReduceTasks() <= 0);
		if( !mapOnlyJob )
			return;
		
		//get the indexes of the final output matrices
		resultIndexes = MRJobConfiguration.getResultIndexes(job);
		resultDimsUnknown = MRJobConfiguration.getResultDimsUnknown(job);
		
		//initialize SystemML counters (defined in MRJobConfiguration)
		resultsNonZeros = new long[resultIndexes.length];
		resultsMaxRowDims = new long[resultIndexes.length];
		resultsMaxColDims = new long[resultIndexes.length];
		
		//map each output tag to the positions of all result indexes that share it
		//(the same tag can back multiple final outputs)
		tagMapping = new HashMap<Byte, ArrayList<Integer>>();
		for( int i = 0; i < resultIndexes.length; i++ ) {
			byte output = resultIndexes[i];
			ArrayList<Integer> vec = tagMapping.get(output);
			if( vec == null ) {
				vec = new ArrayList<Integer>();
				tagMapping.put(output, vec);
			}
			vec.add(i);
		}
		
		//for a map-only job, get the map output converters
		collectFinalMultipleOutputs = MRJobConfiguration.getMultipleConvertedOutputs(job);
	}
	
	public void close() 
		throws IOException
	{
		if( cachedReporter != null && mapOnlyJob )
		{
			//parse the job ID and task ID out of the task attempt ID
			String[] parts = mapperID.split("_");
			String jobID = "job_" + parts[1] + "_" + parts[2];
			int taskid;
			if( parts[0].equalsIgnoreCase("task") ) {
				taskid = Integer.parseInt(parts[parts.length-1]);
			}
			else if( parts[0].equalsIgnoreCase("attempt") ) {
				taskid = Integer.parseInt(parts[parts.length-2]);
			}
			else {
				throw new RuntimeException("Unrecognized format for mapperID: " + mapperID);
			}
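			
			// Worked example (assuming a standard Hadoop task attempt ID): for
			//   mapperID = "attempt_201501010000_0001_m_000002_0"
			// parts[0] is "attempt", so jobID = "job_201501010000_0001" and
			// taskid = Integer.parseInt("000002") = 2; the dims file written
			// below then lands at
			//   <dimsUnknownFilePrefix>/job_201501010000_0001_dimsFile/m_2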
			
			//maintain unknown dimensions (if required, e.g., ctable)
			boolean dimsUnknown = false;
			for( int i = 0; i < resultIndexes.length; i++ ) {
				cachedReporter.incrCounter(MRJobConfiguration.NUM_NONZERO_CELLS, Integer.toString(i), resultsNonZeros[i]);
				if( resultDimsUnknown != null && resultDimsUnknown[i] != (byte) 0 )
					dimsUnknown = true;
			}
			
			if( dimsUnknown ) {
				//every task creates a file with the max row and max column dimensions found in that task
				MapReduceTool.writeDimsFile(dimsUnknownFilePrefix + "/" + jobID + "_dimsFile/" + "m_" + taskid, 
					resultDimsUnknown, resultsMaxRowDims, resultsMaxColDims);
			}
		}
		
		if( collectFinalMultipleOutputs != null )
			collectFinalMultipleOutputs.close();
	}
}
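
/*
 * A minimal usage sketch (hypothetical driver; SystemML's actual job setup is
 * generated elsewhere, e.g., in the GMR job class): the pieces this mapper
 * reads in configure() are the number of reduce tasks (0 selects the map-only
 * code path) and the dims-file prefix. The HDFS path below is a placeholder.
 *
 *   JobConf job = new JobConf(GMRMapper.class);
 *   job.setMapperClass(GMRMapper.class);
 *   job.setNumReduceTasks(0);                                // map-only job
 *   job.set("dims.unknown.file.prefix", "hdfs:///tmp/dims"); // placeholder path
 *   JobClient.runJob(job);
 */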