/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.transform;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.transform.MVImputeAgent.MVMethod;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
/**
 * Encoder that replaces numeric column values by the ID of the equi-width
 * bin they fall into.
 *
 * <p>Two usage modes are visible in this class:
 * <ul>
 * <li>File/MR-based transform: {@link #prepare(String[], TfUtils)} tracks the
 *     per-column min/max, the {@code *TransformationMetadata} methods write
 *     one metadata file per column under {@code <dir>/Bin/},
 *     {@link #loadTxMtd(JobConf, FileSystem, Path, TfUtils)} reads them back
 *     and {@link #apply(String[])} assigns the bin IDs.</li>
 * <li>Frame-based transform-apply: {@link #initMetaData(FrameBlock)} loads
 *     explicit bin boundaries from a meta data frame and
 *     {@link #apply(FrameBlock, MatrixBlock)} assigns the bin IDs.</li>
 * </ul>
 *
 * <p>Column IDs in {@code _colList} are 1-based; bin IDs are 1-based as well.
 */
public class BinAgent extends Encoder
{
    private static final long serialVersionUID = 1917445005206076078L;

    // prefixes used to tag the partial metadata emitted by mappers
    public static final String MIN_PREFIX = "min";
    public static final String MAX_PREFIX = "max";
    public static final String NBINS_PREFIX = "nbins";

    private int[] _numBins = null;
    private double[] _min=null, _max=null; // min and max among non-missing values
    private double[] _binWidths = null; // width of a bin for each attribute

    //frame transform-apply attributes
    private double[][] _binMins = null;
    private double[][] _binMaxs = null;

    /**
     * Creates a binning encoder with fully initialized internal metadata
     * (equivalent to {@code colsOnly=false}).
     *
     * @param parsedSpec parsed transform specification
     * @param colnames column names of the input
     * @param clen number of columns, passed through to the super class
     * @throws JSONException if the specification cannot be read
     * @throws IOException declared for callers; not raised directly here
     */
    public BinAgent(JSONObject parsedSpec, String[] colnames, int clen)
        throws JSONException, IOException
    {
        this(parsedSpec, colnames, clen, false);
    }

    /**
     * Creates a binning encoder. If the specification has no entry for
     * {@code TfUtils.TXMETHOD_BIN}, no column list is initialized and the
     * encoder stays inactive.
     *
     * @param parsedSpec parsed transform specification
     * @param colnames column names of the input
     * @param clen number of columns, passed through to the super class
     * @param colsOnly if true, only the column list is initialized (the
     *        number of bins, min/max and bin widths remain {@code null});
     *        otherwise these arrays are allocated as well
     * @throws JSONException if the specification cannot be read
     * @throws IOException declared for callers; not raised directly here
     */
    public BinAgent(JSONObject parsedSpec, String[] colnames, int clen, boolean colsOnly)
        throws JSONException, IOException
    {
        super( null, clen );
        if ( !parsedSpec.containsKey(TfUtils.TXMETHOD_BIN) )
            return;

        if( colsOnly ) {
            List<Integer> collist = TfMetaUtils.parseBinningColIDs(parsedSpec, colnames);
            initColList(ArrayUtils.toPrimitive(collist.toArray(new Integer[0])));
        }
        else
        {
            JSONObject obj = (JSONObject) parsedSpec.get(TfUtils.TXMETHOD_BIN);
            JSONArray attrs = (JSONArray) obj.get(TfUtils.JSON_ATTRS);
            JSONArray nbins = (JSONArray) obj.get(TfUtils.JSON_NBINS);

            initColList(attrs);
            _numBins = new int[attrs.size()];
            for(int i=0; i < _numBins.length; i++)
                _numBins[i] = UtilFunctions.toInt(nbins.get(i));

            // initialize internal transformation metadata; min/max start at
            // the extreme values so that the first observed value replaces them
            _min = new double[_colList.length];
            Arrays.fill(_min, Double.MAX_VALUE);
            _max = new double[_colList.length];
            Arrays.fill(_max, -Double.MAX_VALUE);

            _binWidths = new double[_colList.length];
        }
    }

    /** @return number of bins per binned column (may be null if created with colsOnly) */
    public int[] getNumBins() { return _numBins; }

    /** @return minimum value per binned column (may be null if created with colsOnly) */
    public double[] getMin() { return _min; }

    /** @return bin width per binned column (may be null if created with colsOnly) */
    public double[] getBinWidths() { return _binWidths; }

    /**
     * Updates the running min/max of every binned column with the values of
     * one input row. Values recognized as NA strings are skipped.
     *
     * @param words fields of one input row, indexed by (column ID - 1)
     * @param agents transform utilities, used for the list of NA strings
     */
    public void prepare(String[] words, TfUtils agents) {
        if ( !isApplicable() )
            return;

        for(int i=0; i <_colList.length; i++) {
            int colID = _colList[i];

            // equi-width
            String w = UtilFunctions.unquote(words[colID-1].trim());
            if(!TfUtils.isNA(agents.getNAStrings(),w)) {
                double d = UtilFunctions.parseToDouble(w);
                if(d < _min[i])
                    _min[i] = d;
                if(d > _max[i])
                    _max[i] = d;
            }
        }
    }

    /** Wraps the current minimum of the idx-th binned column as tagged map output. */
    private DistinctValue prepMinOutput(int idx) throws CharacterCodingException {
        String s = MIN_PREFIX + Double.toString(_min[idx]);
        return new DistinctValue(s, -1L);
    }

    /** Wraps the current maximum of the idx-th binned column as tagged map output. */
    private DistinctValue prepMaxOutput(int idx) throws CharacterCodingException {
        String s = MAX_PREFIX + Double.toString(_max[idx]);
        return new DistinctValue(s, -1L);
    }

    /**
     * Wraps the number of bins of the idx-th binned column as tagged map output.
     * The count is intentionally kept in its double string form (e.g. "5.0")
     * so that the format exchanged between mappers and reducers is unchanged.
     */
    private DistinctValue prepNBinsOutput(int idx) throws CharacterCodingException {
        String s = NBINS_PREFIX + Double.toString(_numBins[idx]);
        return new DistinctValue(s, -1L);
    }

    /**
     * Method to output transformation metadata from the mappers.
     * This information is collected and merged by the reducers.
     * For every binned column, the min, max and number of bins are emitted
     * under the key {@code -colID}.
     *
     * @param out collector receiving the (key, value) pairs
     * @param taskID ID of the emitting task (not used by this encoder)
     * @param agents transform utilities (not used by this encoder)
     * @throws IOException wrapping any failure while emitting
     */
    @Override
    public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
        if( !isApplicable() )
            return;

        try {
            for(int i=0; i < _colList.length; i++) {
                int colID = _colList[i];
                IntWritable iw = new IntWritable(-colID);

                out.collect(iw, prepMinOutput(i));
                out.collect(iw, prepMaxOutput(i));
                out.collect(iw, prepNBinsOutput(i));
            }
        } catch(Exception e) {
            throw new IOException(e);
        }
    }

    /**
     * List-based variant of the map output: appends, for every binned column,
     * the min, max and number of bins under the key {@code -colID}.
     *
     * @param taskID ID of the emitting task (not used by this encoder)
     * @param list list to append to
     * @param agents transform utilities (not used by this encoder)
     * @return the given list, including the appended pairs
     * @throws IOException wrapping any failure while creating the pairs
     */
    public ArrayList<Pair<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
        if ( !isApplicable() )
            return list;

        try {
            for(int i=0; i < _colList.length; i++) {
                int colID = _colList[i];
                Integer iw = -colID;

                list.add( new Pair<Integer,DistinctValue>(iw, prepMinOutput(i)) );
                list.add( new Pair<Integer,DistinctValue>(iw, prepMaxOutput(i)) );
                list.add( new Pair<Integer,DistinctValue>(iw, prepNBinsOutput(i)) );
            }
        } catch(Exception e) {
            throw new IOException(e);
        }
        return list;
    }

    /**
     * Writes the single-line metadata file of one binned column with the
     * format {@code colID,min,max,binwidth,nbins} (fields separated by
     * {@code TfUtils.TXMTD_SEP}). An existing file is overwritten.
     */
    private void writeTfMtd(int colID, String min, String max, String binwidth, String nbins, String tfMtdDir, FileSystem fs, TfUtils agents) throws IOException
    {
        Path pt = new Path(tfMtdDir+"/Bin/"+ agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
        BufferedWriter br = null;
        try {
            br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));
            br.write(colID + TfUtils.TXMTD_SEP + min + TfUtils.TXMTD_SEP + max + TfUtils.TXMTD_SEP + binwidth + TfUtils.TXMTD_SEP + nbins + "\n");
            // Close explicitly on the success path: the buffered content is
            // only pushed to the file system on flush/close, and a failure at
            // that point must be reported to the caller instead of being
            // suppressed by the silent close below.
            br.close();
        }
        finally {
            // guards the error path; closing an already closed writer is a no-op
            IOUtilFunctions.closeSilently(br);
        }
    }

    /**
     * Method to merge map output transformation metadata.
     * Computes the overall min and max of one column from the tagged partial
     * values, takes over the number of bins, and writes the resulting
     * metadata file including the derived bin width.
     *
     * @param values tagged partial values of one column
     * @param outputDir transform metadata directory
     * @param colID 1-based column ID
     * @param fs file system to write to
     * @param agents transform utilities, used to resolve the column name
     * @throws IOException if the metadata file cannot be written
     */
    @Override
    public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
        double min = Double.MAX_VALUE;
        double max = -Double.MAX_VALUE;
        int nbins = 0;

        DistinctValue val = new DistinctValue();
        String w = null;
        double d;
        while(values.hasNext()) {
            // NOTE(review): reset() acts on the previously returned object;
            // kept as-is because the iterator may hand out a reused instance
            // -- confirm before removing.
            val.reset();
            val = values.next();
            w = val.getWord();

            if(w.startsWith(MIN_PREFIX)) {
                d = UtilFunctions.parseToDouble(w.substring( MIN_PREFIX.length() ));
                if ( d < min )
                    min = d;
            }
            else if(w.startsWith(MAX_PREFIX)) {
                d = UtilFunctions.parseToDouble(w.substring( MAX_PREFIX.length() ));
                if ( d > max )
                    max = d;
            }
            else if (w.startsWith(NBINS_PREFIX)) {
                nbins = (int) UtilFunctions.parseToLong( w.substring(NBINS_PREFIX.length() ) );
            }
            else
                throw new RuntimeException("BinAgent: Invalid prefix while merging map output: " + w);
        }

        // write merged metadata
        double binwidth = (max-min)/nbins;
        writeTfMtd(colID, Double.toString(min), Double.toString(max), Double.toString(binwidth), Integer.toString(nbins), outputDir, fs, agents);
    }

    /**
     * Writes the metadata files of all binned columns based on the locally
     * collected min/max values.
     *
     * @param outputDir transform metadata directory
     * @param fs file system to write to
     * @param agents transform utilities, used for the imputation agent and column names
     * @throws IOException if a metadata file cannot be written
     */
    public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
        if( !isApplicable() )
            return;

        MVImputeAgent mvagent = agents.getMVImputeAgent();
        for(int i=0; i < _colList.length; i++) {
            int colID = _colList[i];

            // If the column is imputed with a constant, then adjust min and max based on the value of the constant.
            if ( mvagent.isApplicable(colID) != -1 && mvagent.getMethod(colID) == MVMethod.CONSTANT )
            {
                double cst = UtilFunctions.parseToDouble( mvagent.getReplacement(colID) );
                if ( cst < _min[i])
                    _min[i] = cst;
                if ( cst > _max[i])
                    _max[i] = cst;
            }

            double binwidth = (_max[i] - _min[i])/_numBins[i];
            writeTfMtd(colID, Double.toString(_min[i]), Double.toString(_max[i]), Double.toString(binwidth), Integer.toString(_numBins[i]), outputDir, fs, agents);
        }
    }

    // ------------------------------------------------------------------------------------------------

    /**
     * Method to load transform metadata for all attributes.
     * Reads the first line of each column's metadata file and takes over
     * the minimum, bin width and number of bins.
     *
     * @param job job configuration (not used by this encoder)
     * @param fs file system to read from
     * @param txMtdDir transform metadata directory
     * @param agents transform utilities, used to resolve the column names
     * @throws IOException if a metadata file cannot be read, is empty, or
     *         has fewer fields than expected
     */
    @Override
    public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
        if( !isApplicable() )
            return;

        if(fs.isDirectory(txMtdDir)) {
            for(int i=0; i<_colList.length;i++) {
                int colID = _colList[i];

                Path path = new Path( txMtdDir + "/Bin/" + agents.getName(colID) + TfUtils.TXMTD_BIN_FILE_SUFFIX);
                TfUtils.checkValidInputFile(fs, path, true);

                BufferedReader br = null;
                try {
                    br = new BufferedReader(new InputStreamReader(fs.open(path)));
                    // format: colID,min,max,binwidth,nbins
                    String line = br.readLine();
                    if( line == null )
                        throw new IOException("Empty binning metadata file: " + path);
                    String[] fields = line.split(TfUtils.TXMTD_SEP);
                    if( fields.length < 5 )
                        throw new IOException("Invalid binning metadata in " + path
                            + " (expected colID,min,max,binwidth,nbins): " + line);
                    double min = UtilFunctions.parseToDouble(fields[1]);
                    //double max = UtilFunctions.parseToDouble(fields[2]);
                    double binwidth = UtilFunctions.parseToDouble(fields[3]);
                    int nbins = UtilFunctions.parseToInt(fields[4]);

                    _numBins[i] = nbins;
                    _min[i] = min;
                    _binWidths[i] = binwidth; // (max-min)/nbins;
                }
                finally {
                    IOUtilFunctions.closeSilently(br);
                }
            }
        }
        else {
            throw new RuntimeException("Path to binning metadata must be a directory: " + txMtdDir);
        }
    }

    /**
     * Builds the encoder on the given frame and applies it.
     *
     * @param in input frame
     * @param out output matrix to fill
     * @return the output matrix
     */
    @Override
    public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
        build(in);
        return apply(in, out);
    }

    /**
     * Not implemented yet: currently a no-op, i.e., bin boundaries are not
     * derived from the given frame.
     */
    @Override
    public void build(FrameBlock in) {
        // TODO Auto-generated method stub
    }

    /**
     * Method to apply transformations.
     * Replaces, in place, the value of every binned column by its 1-based
     * bin ID. Starting from the upper boundary of the first bin, boundaries
     * are advanced by the bin width until the value is covered; the bin ID
     * never exceeds the number of bins.
     *
     * @param words fields of one input row, indexed by (column ID - 1)
     * @return the given array with binned columns replaced by bin IDs
     * @throws RuntimeException if a value of a binned column is not numeric
     */
    @Override
    public String[] apply(String[] words) {
        if( !isApplicable() )
            return words;

        for(int i=0; i < _colList.length; i++) {
            int colID = _colList[i];
            try {
                double val = UtilFunctions.parseToDouble(words[colID-1]);
                int binid = 1;
                double tmp = _min[i] + _binWidths[i];
                while(val > tmp && binid < _numBins[i]) {
                    tmp += _binWidths[i];
                    binid++;
                }
                words[colID-1] = Integer.toString(binid);
            }
            catch(NumberFormatException e) {
                throw new RuntimeException("Encountered \"" + words[colID-1] + "\" in column ID \"" + colID + "\", when expecting a numeric value. Consider adding \"" + words[colID-1] + "\" to na.strings, along with an appropriate imputation method.");
            }
        }
        return words;
    }

    /**
     * Assigns 1-based bin IDs to all values of the binned columns, based on
     * the bin maxima loaded by {@link #initMetaData(FrameBlock)}. A value
     * lands in the first bin whose maximum is greater than or equal to it;
     * values above the last maximum are assigned to the last bin, in line
     * with the upper bound enforced by {@link #apply(String[])}.
     *
     * @param in input frame
     * @param out output matrix, updated at the positions of binned columns
     * @return the output matrix
     */
    @Override
    public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
        // nothing to do (and no column list to iterate) without binned columns
        if( !isApplicable() )
            return out;

        for(int j=0; j<_colList.length; j++) {
            int colID = _colList[j];
            for( int i=0; i<in.getNumRows(); i++ ) {
                double inVal = UtilFunctions.objectToDouble(
                    in.getSchema()[colID-1], in.get(i, colID-1));
                // exact match: index of the bin; otherwise -(insertion point)-1
                int ix = Arrays.binarySearch(_binMaxs[j], inVal);
                int binID = ((ix < 0) ? Math.abs(ix+1) : ix) + 1;
                // an insertion point behind the last maximum would yield an
                // ID beyond the number of bins, so clamp to the last bin
                int nbins = _binMaxs[j].length;
                if( nbins > 0 && binID > nbins )
                    binID = nbins;
                out.quickSetValue(i, colID-1, binID);
            }
        }
        return out;
    }

    /**
     * Returns the given meta data frame unchanged.
     *
     * @param meta meta data frame
     * @return the same frame
     */
    @Override
    public FrameBlock getMetaData(FrameBlock meta) {
        return meta;
    }

    /**
     * Loads the bin boundaries of all binned columns from the given meta data
     * frame. For each column, the number of bins is taken from the column's
     * number of distinct values, and every cell holds the minimum and maximum
     * of one bin separated by {@code Lop.DATATYPE_PREFIX}.
     *
     * @param meta meta data frame
     */
    @Override
    public void initMetaData(FrameBlock meta) {
        // nothing to load (and no column list to iterate) without binned columns
        if( !isApplicable() )
            return;

        _binMins = new double[_colList.length][];
        _binMaxs = new double[_colList.length][];
        for( int j=0; j<_colList.length; j++ ) {
            int colID = _colList[j]; //1-based
            int nbins = (int)meta.getColumnMetadata()[colID-1].getNumDistinct();
            _binMins[j] = new double[nbins];
            _binMaxs[j] = new double[nbins];
            for( int i=0; i<nbins; i++ ) {
                String[] tmp = meta.get(i, colID-1).toString().split(Lop.DATATYPE_PREFIX);
                _binMins[j][i] = Double.parseDouble(tmp[0]);
                _binMaxs[j][i] = Double.parseDouble(tmp[1]);
            }
        }
    }
}