/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import com.google.common.base.Functions;
import com.google.common.collect.Ordering;
import com.ibm.bi.dml.runtime.util.UtilFunctions;

/**
 * Transformation agent that dummycodes (one-hot encodes) a configured set of
 * columns. Dummycoding can be applied to recoded columns (domain = recode map
 * size) or binned columns (domain = number of bins). The agent carries no
 * transformation metadata of its own; it derives domain sizes from the recode
 * and bin agents' outputs.
 */
public class DummycodeAgent extends TransformationAgent {

	private static final long serialVersionUID = 5832130477659116489L;

	/** Column IDs to be dummycoded; sorted ascending by loadTxMtd(). Null if none. */
	private int[] _dcdList = null;
	/** Number of columns in the original (pre-transformation) data. */
	private long numCols = 0;

	// Recode maps (value -> recode ID). Exactly one of the two is set:
	// _finalMaps for the MR path, _finalMapsCP for the CP (in-memory) path.
	private HashMap<Integer, HashMap<String, String>> _finalMaps = null;
	private HashMap<Integer, HashMap<String, Long>> _finalMapsCP = null;

	/** Binned column IDs and the per-column bin counts (parallel arrays). */
	private int[] _binList = null;
	private int[] _numBins = null;

	private int[] _domainSizes = null;  // length = #of dummycoded columns
	private int[] _dcdColumnMap = null; // to help in translating between original and dummycoded column IDs
	private long _dummycodedLength = 0; // #of columns after dummycoded

	DummycodeAgent(int[] list) {
		_dcdList = list;
	}

	/**
	 * Constructs the agent from a parsed transformation spec.
	 *
	 * @param parsedSpec JSON transformation specification
	 * @param ncol       number of columns in the input data
	 * @throws JSONException if the spec's dummycode attributes are malformed
	 */
	DummycodeAgent(JSONObject parsedSpec, long ncol) throws JSONException {
		numCols = ncol;

		if (!parsedSpec.containsKey(TX_METHOD.DUMMYCODE.toString()))
			return;

		JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.DUMMYCODE.toString());
		JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);

		_dcdList = new int[attrs.size()];
		for (int i = 0; i < _dcdList.length; i++)
			_dcdList[i] = UtilFunctions.toInt(attrs.get(i));
	}

	public int[] dcdList() {
		return _dcdList;
	}

	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 *
	 * @param out
	 * @throws IOException
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
		// There is no metadata required for dummycode.
		// Required information is output from RecodeAgent.
		return;
	}

	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
		// Nothing to do here
	}

	public void setRecodeMaps(HashMap<Integer, HashMap<String, String>> maps) {
		_finalMaps = maps;
	}

	public void setRecodeMapsCP(HashMap<Integer, HashMap<String, Long>> maps) {
		_finalMapsCP = maps;
	}

	public void setNumBins(int[] binList, int[] numbins) {
		_binList = binList;
		_numBins = numbins;
	}

	/**
	 * Method to generate dummyCodedMaps.csv, with the range of column IDs for each variable in the original data.
	 *
	 * Each line in dummyCodedMaps.csv file is of the form: [ColID, 1/0, st, end]
	 * 1/0 indicates if ColID is dummycoded or not
	 * [st,end] is the range of dummycoded column numbers for the given ColID
	 *
	 * It also generates coltypes.csv, with the type (scale, nominal, etc.) of columns in the output.
	 * Recoded columns are of type nominal, binned columns are of type ordinal, dummycoded columns are of type
	 * dummycoded, and the remaining are of type scale.
	 *
	 * @param fs       file system on which the metadata files are written
	 * @param txMtdDir transformation-metadata directory
	 * @param numCols  number of columns in the original data
	 * @param agents   provides access to the bin and recode agents
	 * @return Number of columns in the transformed data
	 * @throws IOException if writing either metadata file fails
	 */
	public int genDcdMapsAndColTypes(FileSystem fs, String txMtdDir, int numCols, TfUtils agents) throws IOException {

		// initialize all column types in the transformed data to SCALE
		ColumnTypes[] ctypes = new ColumnTypes[(int) _dummycodedLength];
		for (int i = 0; i < _dummycodedLength; i++)
			ctypes[i] = ColumnTypes.SCALE;

		_dcdColumnMap = new int[numCols];

		Path pt = new Path(txMtdDir + "/Dummycode/" + DCD_FILE_NAME);
		BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
		// sum tracks the first output column ID for the current input column
		int sum = 1;
		try {
			int idx = 0;
			for (int colID = 1; colID <= numCols; colID++) {
				if (_dcdList != null && idx < _dcdList.length && _dcdList[idx] == colID) {
					// dummycoded column: expands to _domainSizes[idx] output columns
					br.write(colID + "," + "1" + "," + sum + "," + (sum + _domainSizes[idx] - 1) + "\n");
					_dcdColumnMap[colID - 1] = (sum + _domainSizes[idx] - 1) - 1;

					for (int i = sum; i <= (sum + _domainSizes[idx] - 1); i++)
						ctypes[i - 1] = ColumnTypes.DUMMYCODED;

					sum += _domainSizes[idx];
					idx++;
				}
				else {
					// pass-through column: occupies exactly one output column
					br.write(colID + "," + "0" + "," + sum + "," + sum + "\n");
					_dcdColumnMap[colID - 1] = sum - 1;

					if (agents.getBinAgent().isBinned(colID) != -1)
						ctypes[sum - 1] = ColumnTypes.ORDINAL; // binned variable results in an ordinal column

					if (agents.getRecodeAgent().isRecoded(colID) != -1)
						ctypes[sum - 1] = ColumnTypes.NOMINAL;

					sum += 1;
				}
			}
		}
		finally {
			// close in finally so a failed write does not leak the stream
			br.close();
		}

		// Write coltypes.csv
		pt = new Path(txMtdDir + "/" + COLTYPES_FILE_NAME);
		br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, true)));
		try {
			br.write(columnTypeToID(ctypes[0]) + "");
			for (int i = 1; i < _dummycodedLength; i++)
				br.write("," + columnTypeToID(ctypes[i]));
		}
		finally {
			br.close();
		}

		return sum - 1;
	}

	/**
	 * Given a dummycoded column id, find the corresponding original column ID.
	 *
	 * @param colID dummycoded column ID (1-based)
	 * @return original column ID (1-based), or -1 if colID is out of range
	 */
	public int mapDcdColumnID(int colID) {
		for (int i = 0; i < _dcdColumnMap.length; i++) {
			// _dcdColumnMap[i] is the (0-based) last dummycoded column produced by
			// original column i+1; [st,end] is its 1-based dummycoded ID range.
			int st = (i == 0 ? 1 : _dcdColumnMap[i - 1] + 1 + 1);
			int end = _dcdColumnMap[i] + 1;

			if (colID >= st && colID <= end)
				return i + 1;
		}
		return -1;
	}

	/**
	 * Constructs the header of the transformed (dummycoded) data from the
	 * original header, expanding each dummycoded column into one name per
	 * domain value ("colName" + DCD_NAME_SEP + value) and each binned,
	 * dummycoded column into one name per bin.
	 *
	 * @param header original header line
	 * @param delim  column delimiter pattern
	 * @return header for the transformed data
	 */
	public String constructDummycodedHeader(String header, Pattern delim) {

		if (_dcdList == null && _binList == null)
			// none of the columns are dummycoded, simply return the given header
			return header;

		String[] names = delim.split(header, -1);
		List<String> newNames = null;

		StringBuilder sb = new StringBuilder();

		// NOTE(review): concatenating `delim` invokes Pattern.toString(), which is
		// the regex source — equal to the literal delimiter only for simple
		// delimiters such as ","; confirm for regex-special delimiters.

		// Dummycoding can be performed on either on a recoded column or on a binned column

		// process recoded columns
		if (_finalMapsCP != null && _dcdList != null) {
			for (int i = 0; i < _dcdList.length; i++) {
				int colID = _dcdList[i];
				HashMap<String, Long> map = _finalMapsCP.get(colID);
				String colName = UtilFunctions.unquote(names[colID - 1]);

				if (map != null) {
					// order map entries by their recodeID
					Ordering<String> valueComparator = Ordering.natural().onResultOf(Functions.forMap(map));
					newNames = valueComparator.sortedCopy(map.keySet());

					// construct concatenated string of map entries
					sb.setLength(0);
					for (int idx = 0; idx < newNames.size(); idx++) {
						if (idx == 0)
							sb.append(colName + DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID - 1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}
		else if (_finalMaps != null && _dcdList != null) {
			for (int i = 0; i < _dcdList.length; i++) {
				int colID = _dcdList[i];
				HashMap<String, String> map = _finalMaps.get(colID);
				String colName = UtilFunctions.unquote(names[colID - 1]);

				if (map != null) {
					// order map entries by their recodeID (represented as Strings .. "1", "2", etc.)
					Ordering<String> orderByID = new Ordering<String>() {
						public int compare(String s1, String s2) {
							return (Integer.parseInt(s1) - Integer.parseInt(s2));
						}
					};
					newNames = orderByID.onResultOf(Functions.forMap(map)).sortedCopy(map.keySet());

					// construct concatenated string of map entries
					sb.setLength(0);
					for (int idx = 0; idx < newNames.size(); idx++) {
						if (idx == 0)
							sb.append(colName + DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID - 1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}

		// process binned columns
		if (_binList != null)
			for (int i = 0; i < _binList.length; i++) {
				int colID = _binList[i];

				// need to consider only binned and dummycoded columns
				if (isDummyCoded(colID) == -1)
					continue;

				int numBins = _numBins[i];
				String colName = UtilFunctions.unquote(names[colID - 1]);

				sb.setLength(0);
				for (int idx = 0; idx < numBins; idx++)
					if (idx == 0)
						sb.append(colName + DCD_NAME_SEP + "Bin" + (idx + 1));
					else
						sb.append(delim + colName + DCD_NAME_SEP + "Bin" + (idx + 1));
				names[colID - 1] = sb.toString(); // replace original column name with dcd name
			}

		// Construct the full header
		sb.setLength(0);
		for (int colID = 0; colID < names.length; colID++) {
			if (colID == 0)
				sb.append(names[colID]);
			else
				sb.append(delim + names[colID]);
		}

		return sb.toString();
	}

	/**
	 * Computes per-column domain sizes and the total number of columns after
	 * dummycoding. Domain sizes come from recode maps when available, otherwise
	 * from bin counts for binned columns.
	 */
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {

		if (_dcdList == null) {
			_dummycodedLength = numCols;
			return;
		}

		// sort to-be dummycoded column IDs in ascending order. This is the
		// order in which the new dummycoded record is constructed in apply().
		Arrays.sort(_dcdList);
		_domainSizes = new int[_dcdList.length];

		_dummycodedLength = numCols;

		for (int i = 0; i < _dcdList.length; i++) {
			int colID = _dcdList[i];

			// Find the domain size for colID using _finalMaps or _finalMapsCP
			int domainSize = 0;
			if (_finalMaps != null) {
				if (_finalMaps.get(colID) != null)
					domainSize = _finalMaps.get(colID).size();
			}
			else {
				if (_finalMapsCP.get(colID) != null)
					domainSize = _finalMapsCP.get(colID).size();
			}

			if (domainSize != 0) {
				// dummycoded column
				_domainSizes[i] = domainSize;
			}
			else {
				// binned column
				if (_binList != null)
					for (int j = 0; j < _binList.length; j++) {
						if (colID == _binList[j]) {
							_domainSizes[i] = _numBins[j];
							break;
						}
					}
			}
			// each dummycoded column replaces one original column
			_dummycodedLength += _domainSizes[i] - 1;
		}
	}

	/**
	 * Method to apply transformations.
	 *
	 * @param words  one input record, split into fields
	 * @param agents transformation agents (unused here)
	 * @return dummycoded record; the input array is returned unchanged when no
	 *         column is dummycoded
	 */
	@Override
	public String[] apply(String[] words, TfUtils agents) {

		if (_dcdList == null)
			return words;

		String[] nwords = new String[(int) _dummycodedLength];

		int rcdVal = 0;
		for (int colID = 1, idx = 0, ncolID = 1; colID <= words.length; colID++) {
			if (idx < _dcdList.length && colID == _dcdList[idx]) {
				// dummycoded column: the recoded value selects which of the
				// _domainSizes[idx] output slots gets "1"; the rest stay null
				try {
					rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID - 1]));
					nwords[ncolID - 1 + rcdVal - 1] = "1";
					ncolID += _domainSizes[idx];
					idx++;
				}
				catch (Exception e) {
					System.out.println("Error in dummycoding: colID=" + colID + ", rcdVal=" + rcdVal + ", word="
							+ words[colID - 1] + ", domainSize=" + _domainSizes[idx] + ", dummyCodedLength="
							+ _dummycodedLength);
					throw new RuntimeException(e);
				}
			}
			else {
				nwords[ncolID - 1] = words[colID - 1];
				ncolID++;
			}
		}

		return nwords;
	}

	/**
	 * Check if the given column ID is subjected to this transformation.
	 *
	 * @param colID original column ID (1-based)
	 * @return index of colID within the (sorted) dummycode list, or -1
	 */
	public int isDummyCoded(int colID) {
		if (_dcdList == null)
			return -1;

		int idx = Arrays.binarySearch(_dcdList, colID);
		return (idx >= 0 ? idx : -1);
	}

	@Override
	public void print() {
		System.out.print("Dummycoding List: \n ");
		// guard against the legal "no dummycoded columns" configuration
		if (_dcdList != null) {
			for (int i : _dcdList) {
				System.out.print(i + " ");
			}
		}
		System.out.println();
	}
}