/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import com.google.common.base.Functions;
import com.google.common.collect.Ordering;
import com.ibm.bi.dml.runtime.util.UtilFunctions;

/**
 * Transformation agent that dummycodes (one-hot encodes) a configured set of
 * columns. Dummycoding can be applied to recoded columns (domain = recode map
 * size) or binned columns (domain = number of bins). The agent carries no
 * transformation metadata of its own; it derives domain sizes from the recode
 * and bin agents' outputs.
 */
public class DummycodeAgent extends TransformationAgent {

	private static final long serialVersionUID = 5832130477659116489L;

	/** Column IDs to be dummycoded; sorted ascending by loadTxMtd(). Null if none. */
	private int[] _dcdList = null;
	/** Number of columns in the original (pre-transformation) data. */
	private long numCols = 0;

	// Recode maps (value -> recode ID). Exactly one of the two is set:
	// _finalMaps for the MR path, _finalMapsCP for the CP (in-memory) path.
	private HashMap<Integer, HashMap<String, String>> _finalMaps = null;
	private HashMap<Integer, HashMap<String, Long>> _finalMapsCP = null;

	/** Binned column IDs and the per-column bin counts (parallel arrays). */
	private int[] _binList = null;
	private int[] _numBins = null;

	private int[] _domainSizes = null;  // length = #of dummycoded columns
	private int[] _dcdColumnMap = null; // to help in translating between original and dummycoded column IDs
	private long _dummycodedLength = 0; // #of columns after dummycoded

	DummycodeAgent(int[] list) {
		_dcdList = list;
	}

	/**
	 * Constructs the agent from a parsed transformation spec.
	 *
	 * @param parsedSpec JSON transformation specification
	 * @param ncol       number of columns in the input data
	 * @throws JSONException if the spec's dummycode attributes are malformed
	 */
	DummycodeAgent(JSONObject parsedSpec, long ncol) throws JSONException {
		numCols = ncol;

		if (!parsedSpec.containsKey(TX_METHOD.DUMMYCODE.toString()))
			return;

		JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.DUMMYCODE.toString());
		JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);

		_dcdList = new int[attrs.size()];
		for (int i = 0; i < _dcdList.length; i++)
			_dcdList[i] = UtilFunctions.toInt(attrs.get(i));
	}

	public int[] dcdList() {
		return _dcdList;
	}

	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 *
	 * @param out
	 * @throws IOException
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
		// There is no metadata required for dummycode.
		// Required information is output from RecodeAgent.
		return;
	}

	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
		// Nothing to do here
	}

	public void setRecodeMaps(HashMap<Integer, HashMap<String, String>> maps) {
		_finalMaps = maps;
	}

	public void setRecodeMapsCP(HashMap<Integer, HashMap<String, Long>> maps) {
		_finalMapsCP = maps;
	}

	public void setNumBins(int[] binList, int[] numbins) {
		_binList = binList;
		_numBins = numbins;
	}

	/**
	 * Method to generate dummyCodedMaps.csv, with the range of column IDs for each variable in the original data.
	 *
	 * Each line in dummyCodedMaps.csv file is of the form: [ColID, 1/0, st, end]
	 * 1/0 indicates if ColID is dummycoded or not
	 * [st,end] is the range of dummycoded column numbers for the given ColID
	 *
	 * It also generates coltypes.csv, with the type (scale, nominal, etc.) of columns in the output.
	 * Recoded columns are of type nominal, binned columns are of type ordinal, dummycoded columns are of type
	 * dummycoded, and the remaining are of type scale.
	 *
	 * @param fs       file system on which the metadata files are written
	 * @param txMtdDir transformation-metadata directory
	 * @param numCols  number of columns in the original data
	 * @param agents   provides access to the bin and recode agents
	 * @return Number of columns in the transformed data
	 * @throws IOException if writing either metadata file fails
	 */
	public int genDcdMapsAndColTypes(FileSystem fs, String txMtdDir, int numCols, TfUtils agents) throws IOException {

		// initialize all column types in the transformed data to SCALE
		ColumnTypes[] ctypes = new ColumnTypes[(int) _dummycodedLength];
		for (int i = 0; i < _dummycodedLength; i++)
			ctypes[i] = ColumnTypes.SCALE;

		_dcdColumnMap = new int[numCols];

		Path pt = new Path(txMtdDir + "/Dummycode/" + DCD_FILE_NAME);
		BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
		// sum tracks the first output column ID for the current input column
		int sum = 1;
		try {
			int idx = 0;
			for (int colID = 1; colID <= numCols; colID++) {
				if (_dcdList != null && idx < _dcdList.length && _dcdList[idx] == colID) {
					// dummycoded column: expands to _domainSizes[idx] output columns
					br.write(colID + "," + "1" + "," + sum + "," + (sum + _domainSizes[idx] - 1) + "\n");
					_dcdColumnMap[colID - 1] = (sum + _domainSizes[idx] - 1) - 1;

					for (int i = sum; i <= (sum + _domainSizes[idx] - 1); i++)
						ctypes[i - 1] = ColumnTypes.DUMMYCODED;

					sum += _domainSizes[idx];
					idx++;
				}
				else {
					// pass-through column: occupies exactly one output column
					br.write(colID + "," + "0" + "," + sum + "," + sum + "\n");
					_dcdColumnMap[colID - 1] = sum - 1;

					if (agents.getBinAgent().isBinned(colID) != -1)
						ctypes[sum - 1] = ColumnTypes.ORDINAL; // binned variable results in an ordinal column

					if (agents.getRecodeAgent().isRecoded(colID) != -1)
						ctypes[sum - 1] = ColumnTypes.NOMINAL;

					sum += 1;
				}
			}
		}
		finally {
			// close in finally so a failed write does not leak the stream
			br.close();
		}

		// Write coltypes.csv
		pt = new Path(txMtdDir + "/" + COLTYPES_FILE_NAME);
		br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
		try {
			br.write(columnTypeToID(ctypes[0]) + "");
			for (int i = 1; i < _dummycodedLength; i++)
				br.write("," + columnTypeToID(ctypes[i]));
		}
		finally {
			br.close();
		}

		return sum - 1;
	}

	/**
	 * Given a dummycoded column id, find the corresponding original column ID.
	 *
	 * @param colID dummycoded column ID (1-based)
	 * @return original column ID (1-based), or -1 if colID is out of range
	 */
	public int mapDcdColumnID(int colID) {
		for (int i = 0; i < _dcdColumnMap.length; i++) {
			// _dcdColumnMap[i] is the (0-based) last dummycoded column produced by
			// original column i+1; [st,end] is its 1-based dummycoded ID range.
			int st = (i == 0 ? 1 : _dcdColumnMap[i - 1] + 1 + 1);
			int end = _dcdColumnMap[i] + 1;

			if (colID >= st && colID <= end)
				return i + 1;
		}
		return -1;
	}

	/**
	 * Constructs the header of the transformed (dummycoded) data from the
	 * original header, expanding each dummycoded column into one name per
	 * domain value ("colName" + DCD_NAME_SEP + value) and each binned,
	 * dummycoded column into one name per bin.
	 *
	 * @param header original header line
	 * @param delim  column delimiter pattern
	 * @return header for the transformed data
	 */
	public String constructDummycodedHeader(String header, Pattern delim) {

		if (_dcdList == null && _binList == null)
			// none of the columns are dummycoded, simply return the given header
			return header;

		String[] names = delim.split(header, -1);
		List<String> newNames = null;

		StringBuilder sb = new StringBuilder();

		// NOTE(review): concatenating `delim` invokes Pattern.toString(), which is
		// the regex source — equal to the literal delimiter only for simple
		// delimiters such as ","; confirm for regex-special delimiters.

		// Dummycoding can be performed on either on a recoded column or on a binned column

		// process recoded columns
		if (_finalMapsCP != null && _dcdList != null) {
			for (int i = 0; i < _dcdList.length; i++) {
				int colID = _dcdList[i];
				HashMap<String, Long> map = _finalMapsCP.get(colID);
				String colName = UtilFunctions.unquote(names[colID - 1]);

				if (map != null) {
					// order map entries by their recodeID
					Ordering<String> valueComparator = Ordering.natural().onResultOf(Functions.forMap(map));
					newNames = valueComparator.sortedCopy(map.keySet());

					// construct concatenated string of map entries
					sb.setLength(0);
					for (int idx = 0; idx < newNames.size(); idx++) {
						if (idx == 0)
							sb.append(colName + DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID - 1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}
		else if (_finalMaps != null && _dcdList != null) {
			for (int i = 0; i < _dcdList.length; i++) {
				int colID = _dcdList[i];
				HashMap<String, String> map = _finalMaps.get(colID);
				String colName = UtilFunctions.unquote(names[colID - 1]);

				if (map != null) {
					// order map entries by their recodeID (represented as Strings .. "1", "2", etc.)
					Ordering<String> orderByID = new Ordering<String>() {
						public int compare(String s1, String s2) {
							return (Integer.parseInt(s1) - Integer.parseInt(s2));
						}
					};
					newNames = orderByID.onResultOf(Functions.forMap(map)).sortedCopy(map.keySet());

					// construct concatenated string of map entries
					sb.setLength(0);
					for (int idx = 0; idx < newNames.size(); idx++) {
						if (idx == 0)
							sb.append(colName + DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID - 1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}

		// process binned columns
		if (_binList != null)
			for (int i = 0; i < _binList.length; i++) {
				int colID = _binList[i];

				// need to consider only binned and dummycoded columns
				if (isDummyCoded(colID) == -1)
					continue;

				int numBins = _numBins[i];
				String colName = UtilFunctions.unquote(names[colID - 1]);

				sb.setLength(0);
				for (int idx = 0; idx < numBins; idx++)
					if (idx == 0)
						sb.append(colName + DCD_NAME_SEP + "Bin" + (idx + 1));
					else
						sb.append(delim + colName + DCD_NAME_SEP + "Bin" + (idx + 1));
				names[colID - 1] = sb.toString(); // replace original column name with dcd name
			}

		// Construct the full header
		sb.setLength(0);
		for (int colID = 0; colID < names.length; colID++) {
			if (colID == 0)
				sb.append(names[colID]);
			else
				sb.append(delim + names[colID]);
		}

		return sb.toString();
	}

	/**
	 * Computes per-column domain sizes and the total number of columns after
	 * dummycoding. Domain sizes come from recode maps when available, otherwise
	 * from bin counts for binned columns.
	 */
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {

		if (_dcdList == null) {
			_dummycodedLength = numCols;
			return;
		}

		// sort to-be dummycoded column IDs in ascending order. This is the
		// order in which the new dummycoded record is constructed in apply().
		Arrays.sort(_dcdList);
		_domainSizes = new int[_dcdList.length];

		_dummycodedLength = numCols;

		for (int i = 0; i < _dcdList.length; i++) {
			int colID = _dcdList[i];

			// Find the domain size for colID using _finalMaps or _finalMapsCP
			int domainSize = 0;
			if (_finalMaps != null) {
				if (_finalMaps.get(colID) != null)
					domainSize = _finalMaps.get(colID).size();
			}
			else {
				if (_finalMapsCP.get(colID) != null)
					domainSize = _finalMapsCP.get(colID).size();
			}

			if (domainSize != 0) {
				// dummycoded column
				_domainSizes[i] = domainSize;
			}
			else {
				// binned column
				if (_binList != null)
					for (int j = 0; j < _binList.length; j++) {
						if (colID == _binList[j]) {
							_domainSizes[i] = _numBins[j];
							break;
						}
					}
			}
			// each dummycoded column replaces one original column
			_dummycodedLength += _domainSizes[i] - 1;
		}
	}

	/**
	 * Method to apply transformations.
	 *
	 * @param words  one input record, split into fields
	 * @param agents transformation agents (unused here)
	 * @return dummycoded record; the input array is returned unchanged when no
	 *         column is dummycoded
	 */
	@Override
	public String[] apply(String[] words, TfUtils agents) {

		if (_dcdList == null)
			return words;

		String[] nwords = new String[(int) _dummycodedLength];

		int rcdVal = 0;
		for (int colID = 1, idx = 0, ncolID = 1; colID <= words.length; colID++) {
			if (idx < _dcdList.length && colID == _dcdList[idx]) {
				// dummycoded column: the recoded value selects which of the
				// _domainSizes[idx] output slots gets "1"; the rest stay null
				try {
					rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID - 1]));
					nwords[ncolID - 1 + rcdVal - 1] = "1";
					ncolID += _domainSizes[idx];
					idx++;
				}
				catch (Exception e) {
					System.out.println("Error in dummycoding: colID=" + colID + ", rcdVal=" + rcdVal + ", word="
							+ words[colID - 1] + ", domainSize=" + _domainSizes[idx] + ", dummyCodedLength="
							+ _dummycodedLength);
					throw new RuntimeException(e);
				}
			}
			else {
				nwords[ncolID - 1] = words[colID - 1];
				ncolID++;
			}
		}

		return nwords;
	}

	/**
	 * Check if the given column ID is subjected to this transformation.
	 *
	 * @param colID original column ID (1-based)
	 * @return index of colID within the (sorted) dummycode list, or -1
	 */
	public int isDummyCoded(int colID) {
		if (_dcdList == null)
			return -1;

		int idx = Arrays.binarySearch(_dcdList, colID);
		return (idx >= 0 ? idx : -1);
	}

	@Override
	public void print() {
		System.out.print("Dummycoding List: \n ");
		// guard against the legal "no dummycoded columns" configuration
		if (_dcdList != null) {
			for (int i : _dcdList) {
				System.out.print(i + " ");
			}
		}
		System.out.println();
	}
}