/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

public class DummycodeAgent extends Encoder
{
	private static final long serialVersionUID = 5832130477659116489L;

	private HashMap<Integer, HashMap<String, String>> _finalMaps = null;
	private HashMap<Integer, HashMap<String, Long>> _finalMapsCP = null;
	private int[] _binList = null;
	private int[] _numBins = null;

	private int[] _domainSizes = null;   // length = # of dummycoded columns
	private int[] _dcdColumnMap = null;  // to help in translating between original and dummycoded column IDs
	private long _dummycodedLength = 0;  // # of columns after dummycoding

	public DummycodeAgent(JSONObject parsedSpec, String[] colnames, int clen)
		throws JSONException
	{
		super(null, clen);
		if( parsedSpec.containsKey(TfUtils.TXMETHOD_DUMMYCODE) ) {
			int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE);
			initColList(collist);
		}
	}

	@Override
	public int getNumCols() {
		return (int)_dummycodedLength;
	}
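	// Illustrative example (hypothetical data, not part of the original code):
	// dummycoding expands a recoded categorical column into one 0/1 indicator
	// column per distinct value. For a column "color" recoded as
	//   red -> 1, green -> 2, blue -> 3   (domain size 3)
	// a recoded value 2 becomes the three output columns [0, 1, 0], so the
	// output width grows by domainSize-1 for every dummycoded column
	// (tracked in _dummycodedLength above).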
	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents)
		throws IOException
	{
		// There is no metadata required for dummycode.
		// Required information is output from RecodeAgent.
		return;
	}

	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents)
		throws IOException
	{
		// Nothing to do here
	}

	public void setRecodeMaps(HashMap<Integer, HashMap<String, String>> maps) {
		_finalMaps = maps;
	}

	public void setRecodeMapsCP(HashMap<Integer, HashMap<String, Long>> maps) {
		_finalMapsCP = maps;
	}

	public void setNumBins(int[] binList, int[] numbins) {
		_binList = binList;
		_numBins = numbins;
	}

	/**
	 * Method to generate dummyCodedMaps.csv, with the range of column IDs for each variable in the original data.
	 *
	 * Each line in the dummyCodedMaps.csv file is of the form: [ColID, 1/0, st, end]
	 * 1/0 indicates whether ColID is dummycoded or not, and
	 * [st,end] is the range of dummycoded column numbers for the given ColID.
	 *
	 * It also generates coltypes.csv, with the type (scale, nominal, etc.) of columns in the output.
	 * Recoded columns are of type nominal, binned columns are of type ordinal, dummycoded columns are of type
	 * dummycoded, and the remaining are of type scale.
	 *
	 * @param fs file system
	 * @param txMtdDir path to transform metadata directory
	 * @param numCols number of columns in the original data
	 * @param agents wrapper over the individual transformation agents
	 * @return number of columns in the transformed (dummycoded) output
	 * @throws IOException if IOException occurs
	 */
	public int genDcdMapsAndColTypes(FileSystem fs, String txMtdDir, int numCols, TfUtils agents)
		throws IOException
	{
		// initialize all column types in the transformed data to SCALE
		TfUtils.ColumnTypes[] ctypes = new TfUtils.ColumnTypes[(int)_dummycodedLength];
		for(int i=0; i < _dummycodedLength; i++)
			ctypes[i] = TfUtils.ColumnTypes.SCALE;

		_dcdColumnMap = new int[numCols];

		int sum = 1;
		try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(
				fs.create(new Path(txMtdDir+"/Dummycode/" + TfUtils.DCD_FILE_NAME), true))) )
		{
			int idx = 0;
			for(int colID=1; colID <= numCols; colID++) {
				if( _colList != null && idx < _colList.length && _colList[idx] == colID ) {
					br.write(colID + TfUtils.TXMTD_SEP + "1" + TfUtils.TXMTD_SEP + sum + TfUtils.TXMTD_SEP + (sum+_domainSizes[idx]-1) + "\n");
					_dcdColumnMap[colID-1] = (sum+_domainSizes[idx]-1)-1;

					for(int i=sum; i <= (sum+_domainSizes[idx]-1); i++)
						ctypes[i-1] = TfUtils.ColumnTypes.DUMMYCODED;

					sum += _domainSizes[idx];
					idx++;
				}
				else {
					br.write(colID + TfUtils.TXMTD_SEP + "0" + TfUtils.TXMTD_SEP + sum + TfUtils.TXMTD_SEP + sum + "\n");
					_dcdColumnMap[colID-1] = sum-1;

					if( agents.getBinAgent().isApplicable(colID) != -1 )
						ctypes[sum-1] = TfUtils.ColumnTypes.ORDINAL; // binned variable results in an ordinal column
					if( agents.getRecodeAgent().isApplicable(colID) != -1 )
						ctypes[sum-1] = TfUtils.ColumnTypes.NOMINAL;

					sum += 1;
				}
			}
		}

		// Write coltypes.csv
		try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(
				fs.create(new Path(txMtdDir + File.separator + TfUtils.TXMTD_COLTYPES), true))) )
		{
			br.write(ctypes[0].toID() + "");
			for(int i=1; i < _dummycodedLength; i++)
				br.write(TfUtils.TXMTD_SEP + ctypes[i].toID());
		}

		return sum-1;
	}
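	// Illustrative example (hypothetical metadata, assuming TfUtils.TXMTD_SEP
	// is a comma): with 3 input columns where only column 2 is dummycoded over
	// a domain of size 3, genDcdMapsAndColTypes() writes dummyCodedMaps.csv as
	//   1,0,1,1
	//   2,1,2,4
	//   3,0,5,5
	// i.e., column 2 expands into output columns [2,4]; the method returns 5,
	// and _dcdColumnMap becomes {0, 3, 4} (0-based last output column per
	// original column), which mapDcdColumnID() below inverts.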
	/**
	 * Given a dummycoded column id, find the corresponding original column ID.
	 *
	 * @param colID dummycoded column ID
	 * @return original column ID, -1 if not found
	 */
	public int mapDcdColumnID(int colID) {
		for(int i=0; i < _dcdColumnMap.length; i++) {
			int st = (i==0 ? 1 : _dcdColumnMap[i-1]+1+1);
			int end = _dcdColumnMap[i]+1;
			//System.out.println((i+1) + ": " + "[" + st + "," + end + "]");

			if( colID >= st && colID <= end )
				return i+1;
		}
		return -1;
	}

	public String constructDummycodedHeader(String header, Pattern delim) {
		if( _colList == null && _binList == null )
			// none of the columns are dummycoded, simply return the given header
			return header;

		String[] names = delim.split(header, -1);
		List<String> newNames = null;

		StringBuilder sb = new StringBuilder();

		// Dummycoding can be performed either on a recoded column or on a binned column

		// process recoded columns
		if( _finalMapsCP != null && _colList != null ) {
			for(int i=0; i < _colList.length; i++) {
				int colID = _colList[i];
				HashMap<String, Long> map = _finalMapsCP.get(colID);
				String colName = UtilFunctions.unquote(names[colID-1]);

				if( map != null ) {
					// order map entries by their recodeID
					List<Map.Entry<String, Long>> entryList = new ArrayList<Map.Entry<String, Long>>(map.entrySet());
					Comparator<Map.Entry<String, Long>> comp = new Comparator<Map.Entry<String, Long>>() {
						@Override
						public int compare(Entry<String, Long> entry1, Entry<String, Long> entry2) {
							Long value1 = entry1.getValue();
							Long value2 = entry2.getValue();
							return (int)(value1 - value2);
						}
					};
					Collections.sort(entryList, comp);

					newNames = new ArrayList<String>();
					for(Entry<String, Long> entry : entryList) {
						newNames.add(entry.getKey());
					}

					// construct concatenated string of map entries
					sb.setLength(0);
					for(int idx=0; idx < newNames.size(); idx++) {
						if(idx == 0)
							sb.append(colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID-1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}
		else if( _finalMaps != null && _colList != null ) {
			for(int i=0; i < _colList.length; i++) {
				int colID = _colList[i];
				HashMap<String, String> map = _finalMaps.get(colID);
				String colName = UtilFunctions.unquote(names[colID-1]);

				if( map != null ) {
					// order map entries by their recodeID (represented as Strings .. "1", "2", etc.)
					List<Map.Entry<String, String>> entryList = new ArrayList<Map.Entry<String, String>>(map.entrySet());
					Comparator<Map.Entry<String, String>> comp = new Comparator<Map.Entry<String, String>>() {
						@Override
						public int compare(Entry<String, String> entry1, Entry<String, String> entry2) {
							String value1 = entry1.getValue();
							String value2 = entry2.getValue();
							return (Integer.parseInt(value1) - Integer.parseInt(value2));
						}
					};
					Collections.sort(entryList, comp);

					newNames = new ArrayList<String>();
					for(Entry<String, String> entry : entryList) {
						newNames.add(entry.getKey());
					}

					// construct concatenated string of map entries
					sb.setLength(0);
					for(int idx=0; idx < newNames.size(); idx++) {
						if(idx == 0)
							sb.append(colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID-1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}

		// process binned columns
		if( _binList != null )
			for(int i=0; i < _binList.length; i++) {
				int colID = _binList[i];

				// need to consider only binned and dummycoded columns
				if( isApplicable(colID) == -1 )
					continue;

				int numBins = _numBins[i];
				String colName = UtilFunctions.unquote(names[colID-1]);

				sb.setLength(0);
				for(int idx=0; idx < numBins; idx++)
					if(idx == 0)
						sb.append(colName + TfUtils.DCD_NAME_SEP + "Bin" + (idx+1));
					else
						sb.append(delim + colName + TfUtils.DCD_NAME_SEP + "Bin" + (idx+1));
				names[colID-1] = sb.toString(); // replace original column name with dcd name
			}

		// Construct the full header
		sb.setLength(0);
		for(int colID=0; colID < names.length; colID++) {
			if(colID == 0)
				sb.append(names[colID]);
			else
				sb.append(delim + names[colID]);
		}
		//System.out.println("DummycodedHeader: " + sb.toString());

		return sb.toString();
	}
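	// Illustrative example (hypothetical column names; writing SEP for the
	// value of TfUtils.DCD_NAME_SEP and assuming a "," delimiter): for the
	// header "age,city" with column 2 ("city") dummycoded over the recode map
	//   {NY -> 1, SF -> 2}
	// constructDummycodedHeader() returns
	//   "age,citySEPNY,citySEPSF"
	// i.e., one indicator column name per distinct value, in recodeID order.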
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents)
		throws IOException
	{
		if( !isApplicable() ) {
			_dummycodedLength = _clen;
			return;
		}

		// Sort the to-be dummycoded column IDs in ascending order. This is the order
		// in which the new dummycoded record is constructed in the apply() function.
		Arrays.sort(_colList);
		_domainSizes = new int[_colList.length];

		_dummycodedLength = _clen;

		for(int i=0; i < _colList.length; i++) {
			int colID = _colList[i];

			// Find the domain size for colID using _finalMaps or _finalMapsCP
			int domainSize = 0;
			if( _finalMaps != null ) {
				if( _finalMaps.get(colID) != null )
					domainSize = _finalMaps.get(colID).size();
			}
			else {
				if( _finalMapsCP.get(colID) != null )
					domainSize = _finalMapsCP.get(colID).size();
			}

			if( domainSize != 0 ) {
				// dummycoded column
				_domainSizes[i] = domainSize;
			}
			else {
				// binned column
				if( _binList != null )
					for(int j=0; j < _binList.length; j++) {
						if( colID == _binList[j] ) {
							_domainSizes[i] = _numBins[j];
							break;
						}
					}
			}
			_dummycodedLength += _domainSizes[i]-1;
		}
	}

	@Override
	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
		return apply(in, out);
	}

	@Override
	public void build(FrameBlock in) {
		//do nothing
	}

	/**
	 * Method to apply transformations.
	 *
	 * @param words array of input tokens for one row
	 * @return array of transformed tokens
	 */
	@Override
	public String[] apply(String[] words) {
		if( !isApplicable() )
			return words;

		String[] nwords = new String[(int)_dummycodedLength];
		int rcdVal = 0;

		for(int colID=1, idx=0, ncolID=1; colID <= words.length; colID++) {
			if( idx < _colList.length && colID == _colList[idx] ) {
				// dummycoded columns
				try {
					rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID-1]));
					nwords[ncolID-1+rcdVal-1] = "1";
					ncolID += _domainSizes[idx];
					idx++;
				}
				catch(Exception e) {
					throw new RuntimeException("Error in dummycoding: colID="+colID
						+ ", rcdVal=" + rcdVal + ", word="+words[colID-1]
						+ ", domainSize=" + _domainSizes[idx]
						+ ", dummyCodedLength=" + _dummycodedLength);
				}
			}
			else {
				nwords[ncolID-1] = words[colID-1];
				ncolID++;
			}
		}

		return nwords;
	}
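	// Illustrative example (hypothetical data): with 3 input columns where
	// column 2 is dummycoded over a domain of size 3 (_domainSizes = {3}),
	// the row ["4.2", "2", "7.0"] (recoded value 2 in column 2) expands to
	//   ["4.2", null, "1", null, "7.0"]
	// i.e., a single "1" indicator at offset rcdVal-1 within the dummycoded
	// range; the other positions of that range remain null (empty).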
	@Override
	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
		MatrixBlock ret = new MatrixBlock(out.getNumRows(), (int)_dummycodedLength, false);

		for( int i=0; i < out.getNumRows(); i++ ) {
			for(int colID=1, idx=0, ncolID=1; colID <= out.getNumColumns(); colID++) {
				double val = out.quickGetValue(i, colID-1);
				if( idx < _colList.length && colID == _colList[idx] ) {
					ret.quickSetValue(i, ncolID-1+(int)val-1, 1);
					ncolID += _domainSizes[idx];
					idx++;
				}
				else {
					double ptval = UtilFunctions.objectToDouble(in.getSchema()[colID-1], in.get(i, colID-1));
					ret.quickSetValue(i, ncolID-1, ptval);
					ncolID++;
				}
			}
		}

		return ret;
	}

	@Override
	public FrameBlock getMetaData(FrameBlock out) {
		return out;
	}

	@Override
	public void initMetaData(FrameBlock meta) {
		// initialize domain sizes and output num columns
		_domainSizes = new int[_colList.length];
		_dummycodedLength = _clen;

		for( int j=0; j < _colList.length; j++ ) {
			int colID = _colList[j]; //1-based
			_domainSizes[j] = (int)meta.getColumnMetadata()[colID-1].getNumDistinct();
			_dummycodedLength += _domainSizes[j]-1;
		}
	}
}