/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.sysml.runtime.transform;

import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

public class DummycodeAgent extends Encoder
{
	private static final long serialVersionUID = 5832130477659116489L;

	private HashMap<Integer, HashMap<String, String>> _finalMaps = null;
	private HashMap<Integer, HashMap<String, Long>> _finalMapsCP = null;
	private int[] _binList = null;
	private int[] _numBins = null;

	private int[] _domainSizes = null;   // length = # of dummycoded columns
	private int[] _dcdColumnMap = null;  // to help in translating between original and dummycoded column IDs
	private long _dummycodedLength = 0;  // # of columns after dummycoding

	public DummycodeAgent(JSONObject parsedSpec, String[] colnames, int clen)
		throws JSONException
	{
		super(null, clen);
		if( parsedSpec.containsKey(TfUtils.TXMETHOD_DUMMYCODE) ) {
			int[] collist = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_DUMMYCODE);
			initColList(collist);
		}
	}

	@Override
	public int getNumCols() {
		return (int)_dummycodedLength;
	}
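	// Illustrative example (hypothetical data, not part of the original code):
	// dummycoding expands a recoded categorical column into one 0/1 indicator
	// column per distinct value. For a column "color" recoded as
	//   red -> 1, green -> 2, blue -> 3   (domain size 3)
	// a recoded value 2 becomes the three output columns [0, 1, 0], so the
	// output width grows by domainSize-1 for every dummycoded column
	// (tracked in _dummycodedLength above).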
	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents)
		throws IOException
	{
		// There is no metadata required for dummycode.
		// Required information is output from RecodeAgent.
		return;
	}

	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents)
		throws IOException
	{
		// Nothing to do here
	}

	public void setRecodeMaps(HashMap<Integer, HashMap<String, String>> maps) {
		_finalMaps = maps;
	}

	public void setRecodeMapsCP(HashMap<Integer, HashMap<String, Long>> maps) {
		_finalMapsCP = maps;
	}

	public void setNumBins(int[] binList, int[] numbins) {
		_binList = binList;
		_numBins = numbins;
	}

	/**
	 * Method to generate dummyCodedMaps.csv, with the range of column IDs for each variable in the original data.
	 *
	 * Each line in the dummyCodedMaps.csv file is of the form: [ColID, 1/0, st, end]
	 * 1/0 indicates whether ColID is dummycoded or not, and
	 * [st,end] is the range of dummycoded column numbers for the given ColID.
	 *
	 * It also generates coltypes.csv, with the type (scale, nominal, etc.) of columns in the output.
	 * Recoded columns are of type nominal, binned columns are of type ordinal, dummycoded columns are of type
	 * dummycoded, and the remaining are of type scale.
	 *
	 * @param fs file system
	 * @param txMtdDir path to transform metadata directory
	 * @param numCols number of columns in the original data
	 * @param agents wrapper over the individual transformation agents
	 * @return number of columns in the transformed (dummycoded) output
	 * @throws IOException if IOException occurs
	 */
	public int genDcdMapsAndColTypes(FileSystem fs, String txMtdDir, int numCols, TfUtils agents)
		throws IOException
	{
		// initialize all column types in the transformed data to SCALE
		TfUtils.ColumnTypes[] ctypes = new TfUtils.ColumnTypes[(int)_dummycodedLength];
		for(int i=0; i < _dummycodedLength; i++)
			ctypes[i] = TfUtils.ColumnTypes.SCALE;

		_dcdColumnMap = new int[numCols];

		int sum = 1;
		try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(
				fs.create(new Path(txMtdDir+"/Dummycode/" + TfUtils.DCD_FILE_NAME), true))) )
		{
			int idx = 0;
			for(int colID=1; colID <= numCols; colID++) {
				if( _colList != null && idx < _colList.length && _colList[idx] == colID ) {
					br.write(colID + TfUtils.TXMTD_SEP + "1" + TfUtils.TXMTD_SEP + sum + TfUtils.TXMTD_SEP + (sum+_domainSizes[idx]-1) + "\n");
					_dcdColumnMap[colID-1] = (sum+_domainSizes[idx]-1)-1;

					for(int i=sum; i <= (sum+_domainSizes[idx]-1); i++)
						ctypes[i-1] = TfUtils.ColumnTypes.DUMMYCODED;

					sum += _domainSizes[idx];
					idx++;
				}
				else {
					br.write(colID + TfUtils.TXMTD_SEP + "0" + TfUtils.TXMTD_SEP + sum + TfUtils.TXMTD_SEP + sum + "\n");
					_dcdColumnMap[colID-1] = sum-1;

					if( agents.getBinAgent().isApplicable(colID) != -1 )
						ctypes[sum-1] = TfUtils.ColumnTypes.ORDINAL; // binned variable results in an ordinal column
					if( agents.getRecodeAgent().isApplicable(colID) != -1 )
						ctypes[sum-1] = TfUtils.ColumnTypes.NOMINAL;

					sum += 1;
				}
			}
		}

		// Write coltypes.csv
		try( BufferedWriter br = new BufferedWriter(new OutputStreamWriter(
				fs.create(new Path(txMtdDir + File.separator + TfUtils.TXMTD_COLTYPES), true))) )
		{
			br.write(ctypes[0].toID() + "");
			for(int i=1; i < _dummycodedLength; i++)
				br.write(TfUtils.TXMTD_SEP + ctypes[i].toID());
		}

		return sum-1;
	}
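	// Illustrative example (hypothetical metadata, assuming TfUtils.TXMTD_SEP
	// is a comma): with 3 input columns where only column 2 is dummycoded over
	// a domain of size 3, genDcdMapsAndColTypes() writes dummyCodedMaps.csv as
	//   1,0,1,1
	//   2,1,2,4
	//   3,0,5,5
	// i.e., column 2 expands into output columns [2,4]; the method returns 5,
	// and _dcdColumnMap becomes {0, 3, 4} (0-based last output column per
	// original column), which mapDcdColumnID() below inverts.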
	/**
	 * Given a dummycoded column id, find the corresponding original column ID.
	 *
	 * @param colID dummycoded column ID
	 * @return original column ID, -1 if not found
	 */
	public int mapDcdColumnID(int colID) {
		for(int i=0; i < _dcdColumnMap.length; i++) {
			int st = (i==0 ? 1 : _dcdColumnMap[i-1]+1+1);
			int end = _dcdColumnMap[i]+1;
			//System.out.println((i+1) + ": " + "[" + st + "," + end + "]");

			if( colID >= st && colID <= end )
				return i+1;
		}
		return -1;
	}

	public String constructDummycodedHeader(String header, Pattern delim) {
		if( _colList == null && _binList == null )
			// none of the columns are dummycoded, simply return the given header
			return header;

		String[] names = delim.split(header, -1);
		List<String> newNames = null;

		StringBuilder sb = new StringBuilder();

		// Dummycoding can be performed either on a recoded column or on a binned column

		// process recoded columns
		if( _finalMapsCP != null && _colList != null ) {
			for(int i=0; i < _colList.length; i++) {
				int colID = _colList[i];
				HashMap<String, Long> map = _finalMapsCP.get(colID);
				String colName = UtilFunctions.unquote(names[colID-1]);

				if( map != null ) {
					// order map entries by their recodeID
					List<Map.Entry<String, Long>> entryList = new ArrayList<Map.Entry<String, Long>>(map.entrySet());
					Comparator<Map.Entry<String, Long>> comp = new Comparator<Map.Entry<String, Long>>() {
						@Override
						public int compare(Entry<String, Long> entry1, Entry<String, Long> entry2) {
							Long value1 = entry1.getValue();
							Long value2 = entry2.getValue();
							return (int)(value1 - value2);
						}
					};
					Collections.sort(entryList, comp);

					newNames = new ArrayList<String>();
					for(Entry<String, Long> entry : entryList) {
						newNames.add(entry.getKey());
					}

					// construct concatenated string of map entries
					sb.setLength(0);
					for(int idx=0; idx < newNames.size(); idx++) {
						if(idx == 0)
							sb.append(colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID-1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}
		else if( _finalMaps != null && _colList != null ) {
			for(int i=0; i < _colList.length; i++) {
				int colID = _colList[i];
				HashMap<String, String> map = _finalMaps.get(colID);
				String colName = UtilFunctions.unquote(names[colID-1]);

				if( map != null ) {
					// order map entries by their recodeID (represented as Strings .. "1", "2", etc.)
					List<Map.Entry<String, String>> entryList = new ArrayList<Map.Entry<String, String>>(map.entrySet());
					Comparator<Map.Entry<String, String>> comp = new Comparator<Map.Entry<String, String>>() {
						@Override
						public int compare(Entry<String, String> entry1, Entry<String, String> entry2) {
							String value1 = entry1.getValue();
							String value2 = entry2.getValue();
							return (Integer.parseInt(value1) - Integer.parseInt(value2));
						}
					};
					Collections.sort(entryList, comp);

					newNames = new ArrayList<String>();
					for(Entry<String, String> entry : entryList) {
						newNames.add(entry.getKey());
					}

					// construct concatenated string of map entries
					sb.setLength(0);
					for(int idx=0; idx < newNames.size(); idx++) {
						if(idx == 0)
							sb.append(colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
						else
							sb.append(delim + colName + TfUtils.DCD_NAME_SEP + newNames.get(idx));
					}
					names[colID-1] = sb.toString(); // replace original column name with dcd name
				}
			}
		}

		// process binned columns
		if( _binList != null )
			for(int i=0; i < _binList.length; i++) {
				int colID = _binList[i];

				// need to consider only binned and dummycoded columns
				if( isApplicable(colID) == -1 )
					continue;

				int numBins = _numBins[i];
				String colName = UtilFunctions.unquote(names[colID-1]);

				sb.setLength(0);
				for(int idx=0; idx < numBins; idx++)
					if(idx == 0)
						sb.append(colName + TfUtils.DCD_NAME_SEP + "Bin" + (idx+1));
					else
						sb.append(delim + colName + TfUtils.DCD_NAME_SEP + "Bin" + (idx+1));
				names[colID-1] = sb.toString(); // replace original column name with dcd name
			}

		// Construct the full header
		sb.setLength(0);
		for(int colID=0; colID < names.length; colID++) {
			if(colID == 0)
				sb.append(names[colID]);
			else
				sb.append(delim + names[colID]);
		}
		//System.out.println("DummycodedHeader: " + sb.toString());

		return sb.toString();
	}
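	// Illustrative example (hypothetical column names; writing SEP for the
	// value of TfUtils.DCD_NAME_SEP and assuming a "," delimiter): for the
	// header "age,city" with column 2 ("city") dummycoded over the recode map
	//   {NY -> 1, SF -> 2}
	// constructDummycodedHeader() returns
	//   "age,citySEPNY,citySEPSF"
	// i.e., one indicator column name per distinct value, in recodeID order.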
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents)
		throws IOException
	{
		if( !isApplicable() ) {
			_dummycodedLength = _clen;
			return;
		}

		// Sort the to-be dummycoded column IDs in ascending order. This is the order
		// in which the new dummycoded record is constructed in the apply() function.
		Arrays.sort(_colList);
		_domainSizes = new int[_colList.length];

		_dummycodedLength = _clen;

		for(int i=0; i < _colList.length; i++) {
			int colID = _colList[i];

			// Find the domain size for colID using _finalMaps or _finalMapsCP
			int domainSize = 0;
			if( _finalMaps != null ) {
				if( _finalMaps.get(colID) != null )
					domainSize = _finalMaps.get(colID).size();
			}
			else {
				if( _finalMapsCP.get(colID) != null )
					domainSize = _finalMapsCP.get(colID).size();
			}

			if( domainSize != 0 ) {
				// dummycoded column
				_domainSizes[i] = domainSize;
			}
			else {
				// binned column
				if( _binList != null )
					for(int j=0; j < _binList.length; j++) {
						if( colID == _binList[j] ) {
							_domainSizes[i] = _numBins[j];
							break;
						}
					}
			}
			_dummycodedLength += _domainSizes[i]-1;
		}
	}

	@Override
	public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
		return apply(in, out);
	}

	@Override
	public void build(FrameBlock in) {
		//do nothing
	}

	/**
	 * Method to apply transformations.
	 *
	 * @param words array of input tokens for one row
	 * @return array of transformed tokens
	 */
	@Override
	public String[] apply(String[] words) {
		if( !isApplicable() )
			return words;

		String[] nwords = new String[(int)_dummycodedLength];
		int rcdVal = 0;

		for(int colID=1, idx=0, ncolID=1; colID <= words.length; colID++) {
			if( idx < _colList.length && colID == _colList[idx] ) {
				// dummycoded columns
				try {
					rcdVal = UtilFunctions.parseToInt(UtilFunctions.unquote(words[colID-1]));
					nwords[ncolID-1+rcdVal-1] = "1";
					ncolID += _domainSizes[idx];
					idx++;
				}
				catch(Exception e) {
					throw new RuntimeException("Error in dummycoding: colID="+colID
						+ ", rcdVal=" + rcdVal + ", word="+words[colID-1]
						+ ", domainSize=" + _domainSizes[idx]
						+ ", dummyCodedLength=" + _dummycodedLength);
				}
			}
			else {
				nwords[ncolID-1] = words[colID-1];
				ncolID++;
			}
		}

		return nwords;
	}
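	// Illustrative example (hypothetical data): with 3 input columns where
	// column 2 is dummycoded over a domain of size 3 (_domainSizes = {3}),
	// the row ["4.2", "2", "7.0"] (recoded value 2 in column 2) expands to
	//   ["4.2", null, "1", null, "7.0"]
	// i.e., a single "1" indicator at offset rcdVal-1 within the dummycoded
	// range; the other positions of that range remain null (empty).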
	@Override
	public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
		MatrixBlock ret = new MatrixBlock(out.getNumRows(), (int)_dummycodedLength, false);

		for( int i=0; i < out.getNumRows(); i++ ) {
			for(int colID=1, idx=0, ncolID=1; colID <= out.getNumColumns(); colID++) {
				double val = out.quickGetValue(i, colID-1);
				if( idx < _colList.length && colID == _colList[idx] ) {
					ret.quickSetValue(i, ncolID-1+(int)val-1, 1);
					ncolID += _domainSizes[idx];
					idx++;
				}
				else {
					double ptval = UtilFunctions.objectToDouble(in.getSchema()[colID-1], in.get(i, colID-1));
					ret.quickSetValue(i, ncolID-1, ptval);
					ncolID++;
				}
			}
		}

		return ret;
	}

	@Override
	public FrameBlock getMetaData(FrameBlock out) {
		return out;
	}

	@Override
	public void initMetaData(FrameBlock meta) {
		// initialize domain sizes and output num columns
		_domainSizes = new int[_colList.length];
		_dummycodedLength = _clen;

		for( int j=0; j < _colList.length; j++ ) {
			int colID = _colList[j]; //1-based
			_domainSizes[j] = (int)meta.getColumnMetadata()[colID-1].getNumDistinct();
			_dummycodedLength += _domainSizes[j]-1;
		}
	}
}