/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.transform;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import scala.Tuple2;
import com.ibm.bi.dml.runtime.transform.MVImputeAgent.MVMethod;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
/**
 * Transformation agent for equi-width binning of numeric columns.
 *
 * <p>For every column listed in the binning spec the agent tracks the minimum
 * and maximum of the non-missing values seen by {@link #prepare}, writes one
 * metadata file per column
 * ({@code colID, min, max, binwidth, nbins} joined by {@code TXMTD_SEP}),
 * and later uses {@code min} and {@code binwidth} loaded from that file to
 * replace each value by a 1-based bin ID in {@link #apply}.
 *
 * <p>All per-column arrays ({@code _numBins}, {@code _min}, {@code _max},
 * {@code _binWidths}) are parallel to {@code _binList}. When the spec contains
 * no binning section, {@code _binList} stays {@code null} and every method is
 * a no-op.
 */
public class BinAgent extends TransformationAgent {

    private static final long serialVersionUID = 1917445005206076078L;

    // Prefixes that tag the words emitted as map-side metadata, so that
    // mergeAndOutputTransformationMetadata() can tell the records apart.
    public static final String MIN_PREFIX = "min";
    public static final String MAX_PREFIX = "max";
    public static final String NBINS_PREFIX = "nbins";

    private int[] _binList = null;       // 1-based IDs of the columns to bin, in spec order
    //private byte[] _binMethodList = null; // Not used, since only equi-width is supported for now.
    private int[] _numBins = null;       // number of bins for each attribute
    private double[] _min = null, _max = null; // min and max among non-missing values
    private double[] _binWidths = null;  // width of a bin for each attribute

    BinAgent() { }

    /**
     * Builds the agent from the parsed transformation spec. If the spec has no
     * entry for {@code TX_METHOD.BIN}, the agent is left empty.
     *
     * @param parsedSpec parsed JSON transformation specification
     * @throws JSONException if the binning section cannot be read
     */
    BinAgent(JSONObject parsedSpec) throws JSONException {
        if ( !parsedSpec.containsKey(TX_METHOD.BIN.toString()) )
            return;

        JSONObject obj = (JSONObject) parsedSpec.get(TX_METHOD.BIN.toString());
        JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);
        //JSONArray mthds = (JSONArray) obj.get(JSON_MTHD);
        JSONArray nbins = (JSONArray) obj.get(JSON_NBINS);

        assert(attrs.size() == nbins.size());

        int n = attrs.size();
        _binList = new int[n];
        _numBins = new int[n];
        for(int i=0; i < n; i++) {
            _binList[i] = UtilFunctions.toInt(attrs.get(i));
            _numBins[i] = UtilFunctions.toInt(nbins.get(i));
        }

        // initialize internal transformation metadata; the sentinels guarantee
        // that the first observed value replaces both min and max
        _min = new double[n];
        Arrays.fill(_min, Double.MAX_VALUE);
        _max = new double[n];
        Arrays.fill(_max, -Double.MAX_VALUE);

        _binWidths = new double[n];
    }

    /**
     * Updates the running min/max of every binned column with the values of
     * one input row. Values that {@code agents.isNA} reports as missing are
     * skipped.
     *
     * @param words  fields of the current row (column IDs are 1-based)
     * @param agents shared transform utilities, used for the NA check
     */
    public void prepare(String[] words, TfUtils agents) {
        if ( _binList == null )
            return;

        for(int i=0; i < _binList.length; i++) {
            int colID = _binList[i];

            // equi-width
            String w = UtilFunctions.unquote(words[colID-1].trim());
            if( agents.isNA(w) )
                continue;

            double d = UtilFunctions.parseToDouble(w);
            if(d < _min[i])
                _min[i] = d;
            if(d > _max[i])
                _max[i] = d;
        }
    }

    /** Wraps the current minimum of the idx-th binned column as a prefixed metadata record. */
    private DistinctValue prepMinOutput(int idx) throws CharacterCodingException {
        String s = MIN_PREFIX + Double.toString(_min[idx]);
        return new DistinctValue(s, -1L);
    }

    /** Wraps the current maximum of the idx-th binned column as a prefixed metadata record. */
    private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException {
        String s = MAX_PREFIX + Double.toString(_max[idx]);
        return new DistinctValue(s, -1L);
    }

    /**
     * Wraps the number of bins of the idx-th binned column as a prefixed
     * metadata record. The count is deliberately kept in its original
     * double-formatted form (e.g. {@code "nbins10.0"}) so that the emitted
     * records stay unchanged.
     */
    private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException {
        String s = NBINS_PREFIX + Double.toString(_numBins[idx]);
        return new DistinctValue(s, -1L);
    }

    /**
     * Method to output transformation metadata from the mappers.
     * This information is collected and merged by the reducers.
     *
     * <p>For each binned column three records (min, max, nbins) are emitted
     * under the key {@code -colID}.
     *
     * @param out    collector receiving the records
     * @param taskID ID of the emitting task (not used here)
     * @param agents shared transform utilities (not used here)
     * @throws IOException wrapping any failure while building or collecting a record
     */
    @Override
    public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
        if ( _binList == null )
            return;

        try {
            for(int i=0; i < _binList.length; i++) {
                int colID = _binList[i];
                IntWritable iw = new IntWritable(-colID);

                out.collect(iw, prepMinOutput(i));
                out.collect(iw, prepMaxOutput(i));
                out.collect(iw, prepNBinsOutput(i));
            }
        } catch(Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * List-based variant of the map-side metadata output: appends the same
     * three records per binned column (keyed by {@code -colID}) to
     * {@code list}.
     *
     * @param taskID ID of the emitting task (not used here)
     * @param list   list to append to
     * @param agents shared transform utilities (not used here)
     * @return the same {@code list} instance
     * @throws IOException wrapping any failure while building a record
     */
    public ArrayList<Tuple2<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
        if ( _binList == null )
            return list;

        try {
            for(int i=0; i < _binList.length; i++) {
                int colID = _binList[i];
                Integer iw = -colID;

                list.add( new Tuple2<Integer,DistinctValue>(iw, prepMinOutput(i)) );
                list.add( new Tuple2<Integer,DistinctValue>(iw, prepMaxOutput(i)) );
                list.add( new Tuple2<Integer,DistinctValue>(iw, prepNBinsOutput(i)) );
            }
        } catch(Exception e) {
            throw new IOException(e);
        }
        return list;
    }

    /**
     * Writes the single-line binning metadata file of one column to
     * {@code <tfMtdDir>/Bin/<column name><BIN_FILE_SUFFIX>}, overwriting any
     * existing file. The line has the form
     * {@code colID, min, max, binwidth, nbins} joined by {@code TXMTD_SEP}.
     *
     * @throws IOException if the file cannot be created or written
     */
    private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException
    {
        Path pt = new Path(tfMtdDir+"/Bin/"+ agents.getName(colID) + BIN_FILE_SUFFIX);
        BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
        try {
            br.write(colID + TXMTD_SEP + min + TXMTD_SEP + max + TXMTD_SEP + binwidth + TXMTD_SEP + nbins + "\n");
        }
        finally {
            // always release the underlying stream, even if the write failed
            br.close();
        }
    }

    /**
     * Method to merge map output transformation metadata.
     *
     * <p>Folds the min/max/nbins records of one column into a global min, a
     * global max and the bin count, derives the bin width and writes the
     * column's metadata file.
     *
     * @param values    records emitted for this column by the mappers
     * @param outputDir directory under which the metadata file is written
     * @param colID     ID of the column the records belong to
     * @param fs        file system used for writing
     * @param agents    shared transform utilities, used to resolve the column name
     * @throws IOException if the metadata file cannot be written
     * @throws RuntimeException if a record carries none of the known prefixes
     */
    @Override
    public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
        double min = Double.MAX_VALUE;
        double max = -Double.MAX_VALUE;
        int nbins = 0;

        DistinctValue val = new DistinctValue();
        String w = null;
        double d;
        while(values.hasNext()) {
            // NOTE(review): reset of the previous record is kept from the
            // original code; presumably the iterator may reuse instances.
            val.reset();
            val = values.next();
            w = val.getWord();

            if(w.startsWith(MIN_PREFIX)) {
                d = UtilFunctions.parseToDouble(w.substring( MIN_PREFIX.length() ));
                if ( d < min )
                    min = d;
            }
            else if(w.startsWith(MAX_PREFIX)) {
                d = UtilFunctions.parseToDouble(w.substring( MAX_PREFIX.length() ));
                if ( d > max )
                    max = d;
            }
            else if (w.startsWith(NBINS_PREFIX)) {
                nbins = (int) UtilFunctions.parseToLong( w.substring(NBINS_PREFIX.length() ) );
            }
            else
                throw new RuntimeException("BinAgent: Invalid prefix while merging map output: " + w);
        }

        // write merged metadata
        double binwidth = (max-min)/nbins;
        writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
    }

    /**
     * Writes the metadata files of all binned columns directly from the
     * min/max values collected by this instance (no merge step).
     *
     * @param outputDir directory under which the metadata files are written
     * @param fs        file system used for writing
     * @param agents    shared transform utilities (column names, imputation agent)
     * @throws IOException if a metadata file cannot be written
     */
    public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
        if(_binList == null)
            return;

        MVImputeAgent mvagent = agents.getMVImputeAgent();
        for(int i=0; i < _binList.length; i++) {
            int colID = _binList[i];

            // If the column is imputed with a constant, then adjust min and max based on the value of the constant.
            if ( mvagent.isImputed(colID) != -1 && mvagent.getMethod(colID) == MVMethod.CONSTANT )
            {
                double cst = UtilFunctions.parseToDouble( mvagent.getReplacement(colID) );
                if ( cst < _min[i])
                    _min[i] = cst;
                if ( cst > _max[i])
                    _max[i] = cst;
            }

            double binwidth = (_max[i] - _min[i])/_numBins[i];
            writeTfMtd(colID, Double.toString(_min[i]), Double.toString(_max[i]), Double.toString(binwidth), Integer.toString(_numBins[i]), outputDir, fs, agents);
        }
    }

    // ------------------------------------------------------------------------------------------------

    public int[] getBinList() { return _binList; }
    public int[] getNumBins() { return _numBins; }
    public double[] getMin() { return _min; }
    public double[] getBinWidths() { return _binWidths; }

    /**
     * Method to load transform metadata for all attributes
     *
     * <p>Reads, for every binned column, the first line of
     * {@code <txMtdDir>/Bin/<column name><BIN_FILE_SUFFIX>} and stores its
     * min, bin width and bin count.
     *
     * @param job      job configuration (not used here)
     * @param fs       file system to read from
     * @param txMtdDir directory holding the transformation metadata
     * @param agents   shared transform utilities, used to resolve column names
     * @throws IOException if a metadata file cannot be read, is empty or is malformed
     * @throws RuntimeException if {@code txMtdDir} is not a directory
     */
    @Override
    public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
        if ( _binList == null )
            return;

        if( !fs.isDirectory(txMtdDir) ) {
            fs.close();
            throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);
        }

        for(int i=0; i < _binList.length; i++) {
            int colID = _binList[i];

            Path path = new Path( txMtdDir + "/Bin/" + agents.getName(colID) + BIN_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
            try {
                // format: colID,min,max,binwidth,nbins
                String line = br.readLine();
                if( line == null )
                    throw new IOException("Empty binning metadata file: " + path);

                String[] fields = line.split(TXMTD_SEP);
                if( fields.length < 5 )
                    throw new IOException("Invalid binning metadata in " + path + ": " + line);

                double min = UtilFunctions.parseToDouble(fields[1]);
                //double max = UtilFunctions.parseToDouble(fields[2]);
                double binwidth = UtilFunctions.parseToDouble(fields[3]);
                int nbins = UtilFunctions.parseToInt(fields[4]);

                _numBins[i] = nbins;
                _min[i] = min;
                _binWidths[i] = binwidth; // (max-min)/nbins;
            }
            finally {
                // release the stream even when the file content is invalid
                br.close();
            }
        }
    }

    /**
     * Method to apply transformations.
     *
     * <p>Replaces, in place, the value of every binned column by its 1-based
     * bin ID. Bin {@code k} covers values up to {@code min + k*binwidth};
     * values beyond the last boundary are assigned to the last bin.
     *
     * @param words  fields of the current row (column IDs are 1-based)
     * @param agents shared transform utilities (not used here)
     * @return the same {@code words} array, with binned columns overwritten
     * @throws RuntimeException if a binned column holds a non-numeric value
     */
    @Override
    public String[] apply(String[] words, TfUtils agents) {
        if ( _binList == null )
            return words;

        for(int i=0; i < _binList.length; i++) {
            int colID = _binList[i];

            try {
                double val = UtilFunctions.parseToDouble(words[colID-1]);
                int binid = 1;
                double tmp = _min[i] + _binWidths[i];
                // walk the upper bin boundaries until one is >= val or the last bin is reached
                while(val > tmp && binid < _numBins[i]) {
                    tmp += _binWidths[i];
                    binid++;
                }
                words[colID-1] = Integer.toString(binid);
            } catch(NumberFormatException e)
            {
                throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method.");
            }
        }

        return words;
    }

    /**
     * Check if the given column ID is subjected to this transformation.
     *
     * <p>A linear scan is used because {@code _binList} is kept in spec order
     * and is not guaranteed to be sorted, which a binary search would require.
     *
     * @param colID 1-based column ID
     * @return index of {@code colID} within the bin list, or -1 if the column is not binned
     */
    public int isBinned(int colID)
    {
        if(_binList == null)
            return -1;

        for(int i=0; i < _binList.length; i++) {
            if( _binList[i] == colID )
                return i;
        }
        return -1;
    }

    /**
     * Prints the binned column IDs and their bin counts to standard output.
     * Prints nothing when no binning was specified.
     */
    @Override
    public void print() {
        if(_binList == null)
            return;

        System.out.print("Binning List (Equi-width): \n ");
        for(int i : _binList) {
            System.out.print(i + " ");
        }
        System.out.print("\n ");
        for(int b : _numBins) {
            System.out.print(b + " ");
        }
        System.out.println();
    }
}