/**
 * (C) Copyright IBM Corp. 2010, 2015
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package com.ibm.bi.dml.runtime.transform;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;

import scala.Tuple2;

import com.ibm.bi.dml.runtime.transform.MVImputeAgent.MVMethod;
import com.ibm.bi.dml.runtime.util.UtilFunctions;

/**
 * Transformation agent that replaces numeric column values by the ID of the
 * equi-width bin they fall into.
 *
 * The agent first tracks the minimum and maximum of every binned column
 * ({@link #prepare}), then emits/merges/writes that metadata, and finally
 * loads it back ({@link #loadTxMtd}) to map values to bin IDs ({@link #apply}).
 * All methods are no-ops when the specification contains no binning section.
 */
public class BinAgent extends TransformationAgent {

	private static final long serialVersionUID = 1917445005206076078L;

	// Prefixes that tag the per-column values emitted as map output.
	public static final String MIN_PREFIX = "min";
	public static final String MAX_PREFIX = "max";
	public static final String NBINS_PREFIX = "nbins";

	// Column IDs (1-based) to be binned; null when no binning was specified.
	private int[] _binList = null;
	//private byte[] _binMethodList = null;	// Not used, since only equi-width is supported for now.
	// Number of bins requested for each entry of _binList.
	private int[] _numBins = null;

	private double[] _min=null, _max=null;	// min and max among non-missing values

	private double[] _binWidths = null;		// width of a bin for each attribute

	/**
	 * Creates an agent without any binning specification.
	 */
	BinAgent() { }

	/**
	 * Creates an agent from the parsed transformation specification.
	 * If the specification has no binning section, the agent stays empty.
	 *
	 * @param parsedSpec parsed transformation specification
	 * @throws JSONException if the specification cannot be read
	 */
	BinAgent(JSONObject parsedSpec) throws JSONException {

		if ( !parsedSpec.containsKey(TX_METHOD.BIN.toString()) )
			return;

		JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.BIN.toString());

		JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);
		//JSONArray mthds = (JSONArray) obj.get(JSON_MTHD);
		JSONArray nbins = (JSONArray) obj.get(JSON_NBINS);

		assert(attrs.size() == nbins.size());

		_binList = new int[attrs.size()];
		_numBins = new int[attrs.size()];
		for(int i=0; i < _binList.length; i++) {
			_binList[i] = UtilFunctions.toInt(attrs.get(i));
			_numBins[i] = UtilFunctions.toInt(nbins.get(i));
		}

		// initialize internal transformation metadata
		_min = new double[_binList.length];
		Arrays.fill(_min, Double.MAX_VALUE);
		_max = new double[_binList.length];
		Arrays.fill(_max, -Double.MAX_VALUE);

		_binWidths = new double[_binList.length];
	}

	/**
	 * Updates the running minimum and maximum of every binned column with the
	 * values of one input row. Values recognized as NA by <code>agents</code>
	 * are skipped.
	 *
	 * @param words  column values of one row (column ID c is at index c-1)
	 * @param agents helper used to detect missing values
	 */
	public void prepare(String[] words, TfUtils agents) {
		if ( _binList == null )
			return;

		for(int i=0; i <_binList.length; i++) {
			int colID = _binList[i];

			// equi-width
			String w = UtilFunctions.unquote(words[colID-1].trim());
			if(!agents.isNA(w)) {
				double d = UtilFunctions.parseToDouble(w);
				if(d < _min[i])
					_min[i] = d;
				if(d > _max[i])
					_max[i] = d;
			}
		}
	}

	/** Builds the map-output value carrying the observed minimum of entry <code>idx</code>. */
	private DistinctValue prepMinOutput(int idx) throws CharacterCodingException {
		String s = MIN_PREFIX + Double.toString(_min[idx]);
		return new DistinctValue(s, -1L);
	}

	/** Builds the map-output value carrying the observed maximum of entry <code>idx</code>. */
	private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException {
		String s = MAX_PREFIX + Double.toString(_max[idx]);
		return new DistinctValue(s, -1L);
	}

	/**
	 * Builds the map-output value carrying the number of bins of entry
	 * <code>idx</code>. The count is formatted via Double.toString.
	 */
	private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException {
		String s = NBINS_PREFIX + Double.toString(_numBins[idx]);
		return new DistinctValue(s, -1L);
	}

	/**
	 * Method to output transformation metadata from the mappers.
	 * This information is collected and merged by the reducers.
	 * For every binned column, three values (min, max, number of bins) are
	 * emitted under the negated column ID.
	 *
	 * @param out    collector receiving the metadata
	 * @param taskID not used by this agent
	 * @param agents not used by this agent
	 * @throws IOException if emitting a value fails
	 */
	@Override
	public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
		if ( _binList == null )
			return;

		try {
			for(int i=0; i < _binList.length; i++) {
				int colID = _binList[i];
				IntWritable iw = new IntWritable(-colID);

				out.collect(iw, prepMinOutput(i));
				out.collect(iw, prepMaxOutput(i));
				out.collect(iw, prepNBinsOutput(i));
			}
		} catch(Exception e) {
			throw new IOException(e);
		}
	}

	/**
	 * List-based variant of the map output: appends, for every binned column,
	 * the min, max and number-of-bins values under the negated column ID.
	 *
	 * @param taskID not used by this agent
	 * @param list   list to append to
	 * @param agents not used by this agent
	 * @return the same <code>list</code> instance that was passed in
	 * @throws IOException if building a value fails
	 */
	public ArrayList<Tuple2<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
		if ( _binList == null )
			return list;

		try {
			for(int i=0; i < _binList.length; i++) {
				int colID = _binList[i];
				Integer iw = -colID;

				list.add( new Tuple2<Integer,DistinctValue>(iw, prepMinOutput(i)) );
				list.add( new Tuple2<Integer,DistinctValue>(iw, prepMaxOutput(i)) );
				list.add( new Tuple2<Integer,DistinctValue>(iw, prepNBinsOutput(i)) );
			}
		} catch(Exception e) {
			throw new IOException(e);
		}
		return list;
	}

	/**
	 * Writes the binning metadata of one column as a single line
	 * (colID, min, max, binwidth, nbins separated by TXMTD_SEP) to
	 * <code>tfMtdDir/Bin/&lt;column name&gt;&lt;BIN_FILE_SUFFIX&gt;</code>,
	 * overwriting an existing file.
	 */
	private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException
	{
		Path pt = new Path(tfMtdDir+"/Bin/"+ agents.getName(colID) + BIN_FILE_SUFFIX);
		BufferedWriter br=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
		try {
			br.write(colID + TXMTD_SEP + min + TXMTD_SEP + max + TXMTD_SEP + binwidth + TXMTD_SEP + nbins + "\n");
		}
		finally {
			// release the stream even if the write fails
			br.close();
		}
	}

	/**
	 * Method to merge map output transformation metadata.
	 * Takes the smallest min and the largest max over all values, derives the
	 * bin width as (max-min)/nbins and writes the result for the column.
	 *
	 * @param values    map-output values of one column
	 * @param outputDir directory receiving the metadata file
	 * @param colID     ID of the column being merged
	 * @param fs        file system to write to
	 * @param agents    helper used to resolve the column name
	 * @throws IOException if the metadata file cannot be written
	 */
	@Override
	public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
		double min = Double.MAX_VALUE;
		double max = -Double.MAX_VALUE;
		int nbins = 0;

		DistinctValue val = new DistinctValue();
		String w = null;
		double d;
		while(values.hasNext()) {
			val.reset();
			val = values.next();
			w = val.getWord();

			if(w.startsWith(MIN_PREFIX)) {
				d = UtilFunctions.parseToDouble(w.substring( MIN_PREFIX.length() ));
				if ( d < min )
					min = d;
			}
			else if(w.startsWith(MAX_PREFIX)) {
				d = UtilFunctions.parseToDouble(w.substring( MAX_PREFIX.length() ));
				if ( d > max )
					max = d;
			}
			else if (w.startsWith(NBINS_PREFIX)) {
				nbins = (int) UtilFunctions.parseToLong( w.substring(NBINS_PREFIX.length() ) );
			}
			else
				throw new RuntimeException("BinAgent: Invalid prefix while merging map output: " + w);
		}

		// write merged metadata
		double binwidth = (max-min)/nbins;
		writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
	}

	/**
	 * Writes the binning metadata of all binned columns directly from the
	 * locally tracked min/max values.
	 *
	 * @param outputDir directory receiving the metadata files
	 * @param fs        file system to write to
	 * @param agents    provides the imputation agent and the column names
	 * @throws IOException if a metadata file cannot be written
	 */
	public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
		if(_binList == null)
			return;

		MVImputeAgent mvagent = agents.getMVImputeAgent();
		for(int i=0; i < _binList.length; i++) {
			int colID = _binList[i];

			// If the column is imputed with a constant, then adjust min and max based the value of the constant.
			if ( mvagent.isImputed(colID) != -1 && mvagent.getMethod(colID) == MVMethod.CONSTANT )
			{
				double cst = UtilFunctions.parseToDouble( mvagent.getReplacement(colID) );
				if ( cst < _min[i])
					_min[i] = cst;
				if ( cst > _max[i])
					_max[i] = cst;
			}

			double binwidth = (_max[i] - _min[i])/_numBins[i];
			writeTfMtd(colID, Double.toString(_min[i]), Double.toString(_max[i]), Double.toString(binwidth), Integer.toString(_numBins[i]), outputDir, fs, agents);
		}
	}

	// ------------------------------------------------------------------------------------------------

	/** @return the binned column IDs, or null if no binning was specified */
	public int[] getBinList() { return _binList; }

	/** @return the number of bins per binned column */
	public int[] getNumBins() { return _numBins; }

	/** @return the minimum per binned column */
	public double[] getMin()  { return _min; }

	/** @return the bin width per binned column (populated by {@link #loadTxMtd}) */
	public double[] getBinWidths() { return _binWidths; }

	/**
	 * Method to load transform metadata for all attributes.
	 * Reads the first line of each column's metadata file and stores its
	 * minimum, bin width and number of bins.
	 *
	 * @param job      not used by this agent
	 * @param fs       file system to read from
	 * @param txMtdDir directory holding the transformation metadata
	 * @param agents   helper used to resolve the column names
	 * @throws IOException if a metadata file is empty or cannot be read
	 * @throws RuntimeException if <code>txMtdDir</code> is not a directory
	 */
	@Override
	public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
		if ( _binList == null )
			return;

		if(fs.isDirectory(txMtdDir)) {
			for(int i=0; i<_binList.length;i++) {
				int colID = _binList[i];

				Path path = new Path( txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX);
				TfUtils.checkValidInputFile(fs, path, true);

				BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
				try {
					// format: colID,min,max,binwidth,nbins
					String line = br.readLine();
					if ( line == null )
						throw new IOException("Empty binning metadata file: " + path);

					String[] fields = line.split(TXMTD_SEP);
					double min = UtilFunctions.parseToDouble(fields[1]);
					//double max = UtilFunctions.parseToDouble(fields[2]);
					double binwidth = UtilFunctions.parseToDouble(fields[3]);
					int nbins = UtilFunctions.parseToInt(fields[4]);

					_numBins[i] = nbins;
					_min[i] = min;
					_binWidths[i] = binwidth; // (max-min)/nbins;
				}
				finally {
					// release the stream even if parsing fails
					br.close();
				}
			}
		}
		else {
			fs.close();
			throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
		}
	}

	/**
	 * Method to apply transformations.
	 * Replaces the value of every binned column by its 1-based bin ID. Bin 1
	 * covers values up to min + binwidth; values beyond the last boundary are
	 * assigned to the last bin.
	 *
	 * @param words  column values of one row (modified in place)
	 * @param agents not used by this agent
	 * @return the same <code>words</code> array
	 * @throws RuntimeException if a binned column holds a non-numeric value
	 */
	@Override
	public String[] apply(String[] words, TfUtils agents) {
		if ( _binList == null )
			return words;

		for(int i=0; i < _binList.length; i++) {
			int colID = _binList[i];

			try {
				double val = UtilFunctions.parseToDouble(words[colID-1]);
				int binid = 1;
				// upper boundary of the current bin; advance until val fits or the last bin is reached
				double tmp = _min[i] + _binWidths[i];
				while(val > tmp && binid < _numBins[i]) {
					tmp += _binWidths[i];
					binid++;
				}
				words[colID-1] = Integer.toString(binid);
			} catch(NumberFormatException e)
			{
				throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method.");
			}
		}

		return words;
	}

	/**
	 * Check if the given column ID is subjected to this transformation.
	 *
	 * NOTE(review): Arrays.binarySearch requires a sorted array and the
	 * constructor does not sort _binList, so the specification is presumed to
	 * list the attributes in ascending order -- confirm against the caller.
	 *
	 * @param colID column ID to look up
	 * @return index of the column within the bin list, or -1 if it is not binned
	 */
	public int isBinned(int colID)
	{
		if(_binList == null)
			return -1;

		int idx = Arrays.binarySearch(_binList, colID);
		return ( idx >= 0 ? idx : -1);
	}

	/**
	 * Prints the binned column IDs followed by their numbers of bins to
	 * standard output. Only the header is printed when no binning was
	 * specified.
	 */
	@Override
	public void print() {
		System.out.print("Binning List (Equi-width): \n ");
		// guard like every other method: both arrays are null without a binning spec
		if ( _binList != null ) {
			for(int i : _binList) {
				System.out.print(i + " ");
			}
			System.out.print("\n ");
			for(int b : _numBins) {
				System.out.print(b + " ");
			}
		}
		System.out.println();
	}

}