/**
* (C) Copyright IBM Corp. 2010, 2015
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package com.ibm.bi.dml.runtime.transform;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.wink.json4j.JSONArray;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
import scala.Tuple2;
import com.google.common.collect.Ordering;
import com.ibm.bi.dml.runtime.transform.MVImputeAgent.MVMethod;
import com.ibm.bi.dml.runtime.util.UtilFunctions;
/**
 * Transformation agent that recodes the distinct string values of selected
 * columns into consecutive integer codes.
 *
 * While scanning the data, {@link #prepare} counts how often each distinct
 * value occurs per column. The metadata writers then sort the distinct values
 * and assign the codes 1..n in that order; {@link #apply} and
 * {@link #cp_apply} replace the column values by their codes.
 *
 * Column IDs are 1-based: column {@code colID} is read from
 * {@code words[colID-1]}.
 */
public class RecodeAgent extends TransformationAgent {

    private static final long serialVersionUID = 8213163881283341874L;

    /** Column IDs listed under the RECODE method of the spec; null if the method is absent. */
    private int[] _rcdList = null;

    /** Column IDs listed under the MVRCD method of the spec; null if the method is absent. */
    private int[] _mvrcdList = null;

    /** {@code _rcdList} followed by {@code _mvrcdList}; null if both are absent or empty. */
    private int[] _fullrcdList = null;

    // HashMap< columnID, HashMap<distinctValue, count> >
    // NOTE: writeMetadata() overwrites the counts with the assigned recode indexes.
    private HashMap<Integer, HashMap<String, Long>> _rcdMaps = new HashMap<Integer, HashMap<String, Long>>();

    /** HashMap< columnID, HashMap<distinctValue, recodeIndex as string> >, filled by loadTxMtd(). */
    HashMap<Integer, HashMap<String,String>> _finalMaps = null;

    /**
     * Reads the RECODE and MVRCD attribute lists from the parsed specification.
     *
     * @param parsedSpec parsed transformation specification
     * @throws JSONException if the specification can not be read
     */
    RecodeAgent(JSONObject parsedSpec) throws JSONException {
        _rcdList = readAttrList(parsedSpec, TX_METHOD.RECODE.toString());
        _mvrcdList = readAttrList(parsedSpec, TX_METHOD.MVRCD.toString());

        int rcdLen = (_rcdList == null) ? 0 : _rcdList.length;
        int mvLen = (_mvrcdList == null) ? 0 : _mvrcdList.length;
        if ( rcdLen + mvLen > 0 ) {
            _fullrcdList = new int[rcdLen + mvLen];
            if ( rcdLen > 0 )
                System.arraycopy(_rcdList, 0, _fullrcdList, 0, rcdLen);
            if ( mvLen > 0 )
                System.arraycopy(_mvrcdList, 0, _fullrcdList, rcdLen, mvLen);
        }
    }

    /**
     * Extracts the column IDs listed under the given method of the specification.
     *
     * @param parsedSpec parsed transformation specification
     * @param method     key of the transformation method inside the specification
     * @return the listed column IDs in specification order, or null if the
     *         method is not part of the specification
     * @throws JSONException if the specification can not be read
     */
    private int[] readAttrList(JSONObject parsedSpec, String method) throws JSONException {
        if ( !parsedSpec.containsKey(method) )
            return null;

        JSONObject obj = (JSONObject) parsedSpec.get(method);
        JSONArray attrs = (JSONArray) obj.get(JSON_ATTRS);
        int[] ids = new int[attrs.size()];
        for(int i=0; i < ids.length; i++)
            ids[i] = UtilFunctions.toInt(attrs.get(i));
        return ids;
    }

    /**
     * Updates the per-column distinct value counts with the values of one row.
     *
     * @param words  column values of the row
     * @param agents transformation utilities (not used here)
     */
    void prepare(String[] words, TfUtils agents) {
        // _fullrcdList is also null when the attribute lists exist but are empty
        if ( _fullrcdList == null )
            return;

        for (int colID : _fullrcdList) {
            String w = UtilFunctions.unquote(words[colID-1].trim());

            HashMap<String, Long> map = _rcdMaps.get(colID);
            if ( map == null ) {
                map = new HashMap<String, Long>();
                _rcdMaps.put(colID, map);
            }

            Long count = map.get(w);
            map.put(w, (count == null) ? 1L : count + 1);
        }
    }

    /**
     * If the column is imputed with a constant, accounts for the replacement
     * value in the given map: the number of missing values
     * ({@code agents.getValid() - mvagent.getNonMVCount(colID)}) is added to
     * the count of the replacement value. The map is modified in place.
     *
     * @param colID  column ID
     * @param agents transformation utilities
     * @param map    distinct value counts of the column
     * @return the given map
     */
    private HashMap<String, Long> handleMVConstant(int colID, TfUtils agents, HashMap<String, Long> map)
    {
        MVImputeAgent mvagent = agents.getMVImputeAgent();
        if ( mvagent.getMethod(colID) != MVMethod.CONSTANT )
            return map;

        // check if the "replacement" is part of the map. If not, add it.
        String repValue = mvagent.getReplacement(colID);
        if ( repValue == null )
            throw new RuntimeException("Expecting a constant replacement value for column ID " + colID);
        repValue = UtilFunctions.unquote(repValue);

        long mvCount = agents.getValid() - mvagent.getNonMVCount(colID);
        Long count = map.get(repValue);
        map.put(repValue, (count == null) ? mvCount : count + mvCount);
        return map;
    }

    /**
     * Method to output transformation metadata from the mappers.
     * This information is collected and merged by the reducers.
     *
     * @param out    collector receiving (column ID, distinct value) pairs
     * @param taskID task identifier (not used here)
     * @param agents transformation utilities
     * @throws IOException if the metadata can not be emitted
     */
    @Override
    public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
        mapOutputHelper(taskID, out, null, agents);
    }

    /**
     * Variant of the metadata output that appends the
     * (column ID, distinct value) pairs to a list.
     *
     * @param taskID task identifier (not used here)
     * @param list   list receiving the pairs
     * @param agents transformation utilities
     * @return the given list
     * @throws IOException if the metadata can not be produced
     */
    public ArrayList<Tuple2<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
        mapOutputHelper(taskID, null, list, agents);
        return list;
    }

    /**
     * Emits one (column ID, distinct value with count) pair per distinct value
     * of every recoded column, either into {@code out} or, if {@code out} is
     * null, into {@code list}.
     *
     * @param taskID task identifier (not used here)
     * @param out    collector to write to, may be null
     * @param list   list to append to, used only if {@code out} is null
     * @param agents transformation utilities
     * @throws IOException if emitting fails; failures other than IOException
     *                     are wrapped into an IOException
     */
    public void mapOutputHelper(int taskID, OutputCollector<IntWritable, DistinctValue> out, ArrayList<Tuple2<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
        if ( _fullrcdList == null )
            return;

        try
        {
            for(int colID : _fullrcdList)
            {
                HashMap<String, Long> map = _rcdMaps.get(colID);
                if ( map == null )
                    continue;

                map = handleMVConstant(colID, agents, map);

                if ( out != null ) {
                    IntWritable iw = new IntWritable(colID);
                    for(Map.Entry<String, Long> entry : map.entrySet())
                        out.collect(iw, new DistinctValue(entry.getKey(), entry.getValue()));
                }
                else if ( list != null ) {
                    for(Map.Entry<String, Long> entry : map.entrySet())
                        list.add(new Tuple2<Integer,DistinctValue>(colID, new DistinctValue(entry.getKey(), entry.getValue())) );
                }
            }
        } catch(IOException ex) {
            // already the declared type, no need to wrap it again
            throw ex;
        } catch(Exception ex) {
            throw new IOException(ex);
        }
    }

    /**
     * Function to output transformation metadata, including:
     * - recode maps,
     * - number of distinct values,
     * - mode, and
     * - imputation value (in the case of global_mode)
     *
     * The column for which this function is invoked can be one of the following:
     * - just recoded                       (write .map, .ndistinct, .mode)
     * - just mv imputed (w/ global_mode)   (write .impute)
     * - both recoded and mv imputed        (write .map, .ndistinct, .mode, .impute)
     *
     * Side effects on {@code map}: the NA strings are removed and every count
     * is replaced by the recode index assigned to the value.
     *
     * @param map       distinct value counts of the column
     * @param outputDir directory below which the metadata files are written
     * @param colID     column ID
     * @param fs        file system to write to
     * @param agents    transformation utilities
     * @param fromCP    if true, constant imputation is folded into the map first
     * @throws IOException if a metadata file can not be written
     */
    private void writeMetadata(HashMap<String,Long> map, String outputDir, int colID, FileSystem fs, TfUtils agents, boolean fromCP) throws IOException {
        MVImputeAgent mvagent = agents.getMVImputeAgent();
        boolean isRecoded = (isRecoded(colID) != -1);
        boolean isModeImputed = (mvagent.getMethod(colID) == MVMethod.GLOBAL_MODE);

        // remove NA strings
        if ( agents.getNAStrings() != null)
            for(String naword : agents.getNAStrings())
                map.remove(naword);

        if(fromCP)
            map = handleMVConstant(colID, agents, map);

        // validate before any file is opened, so that nothing is left open
        // (or created empty) when the column has no valid value
        if ( map.size() == 0 )
            throw new RuntimeException("Can not proceed since \"" + agents.getName(colID) + "\" (id=" + colID + ") contains only the missing values, and not a single valid value -- set imputation method to \"constant\".");

        // Order entries by category (string) value
        Ordering<String> valueComparator = Ordering.natural();
        List<String> sortedValues = valueComparator.sortedCopy(map.keySet());

        String recodePrefix = outputDir + "/Recode/" + agents.getName(colID);

        // the map is non-empty here, so the loop below always sets the mode
        String mode = null;
        int modeIndex = 0;
        long maxCount = Long.MIN_VALUE;

        BufferedWriter mapWriter = null;
        try {
            if ( isRecoded )
                mapWriter = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(recodePrefix + RCD_MAP_FILE_SUFFIX), true)));

            int rcdIndex = 0;
            for(String w : sortedValues) {
                long count = map.get(w);
                ++rcdIndex;

                // output (w, rcdIndex, count)
                if ( mapWriter != null )
                    mapWriter.write(UtilFunctions.quote(w) + TXMTD_SEP + rcdIndex + TXMTD_SEP + count + "\n");

                // strict comparison: among equally frequent values the first in sort order wins
                if ( maxCount < count ) {
                    maxCount = count;
                    mode = w;
                    modeIndex = rcdIndex;
                }

                // Replace count with recode index (useful when invoked from CP)
                map.put(w, (long)rcdIndex);
            }
        }
        finally {
            if ( mapWriter != null )
                mapWriter.close();
        }

        if ( isRecoded )
        {
            // output mode
            writeTextFile(fs, new Path(recodePrefix + MODE_FILE_SUFFIX), UtilFunctions.quote(mode) + "," + modeIndex + "," + maxCount);

            // output number of distinct values
            writeTextFile(fs, new Path(recodePrefix + NDISTINCT_FILE_SUFFIX), "" + map.size());
        }

        if ( isModeImputed )
        {
            writeTextFile(fs, new Path(outputDir + "/Impute/" + agents.getName(colID) + MV_FILE_SUFFIX), colID + "," + UtilFunctions.quote(mode));
        }
    }

    /**
     * Writes the given text to the given path (overwriting an existing file)
     * and closes the stream even if writing fails.
     */
    private static void writeTextFile(FileSystem fs, Path path, String content) throws IOException {
        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
        try {
            writer.write(content);
        }
        finally {
            writer.close();
        }
    }

    /**
     * Writes the metadata of all recoded columns from the locally collected
     * counts. Afterwards the local maps hold recode indexes instead of counts.
     *
     * @param outputDir directory below which the metadata files are written
     * @param fs        file system to write to
     * @param agents    transformation utilities
     * @throws IOException if a metadata file can not be written
     */
    public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
        if ( _fullrcdList == null )
            return;

        for(int colID : _fullrcdList) {
            HashMap<String, Long> map = _rcdMaps.get(colID);
            if ( map == null ) {
                // no value was ever collected for this column: continue with an
                // empty map so that writeMetadata() reports the problem (or
                // applies constant imputation) instead of failing on null
                map = new HashMap<String, Long>();
                _rcdMaps.put(colID, map);
            }
            writeMetadata(map, outputDir, colID, fs, agents, true);
        }
    }

    /**
     * Method to merge map output transformation metadata.
     * Counts of equal words are summed up and the result is written as
     * metadata of the given column.
     *
     * @param values    distinct values (word and count) of the column
     * @param outputDir directory below which the metadata files are written
     * @param colID     column ID
     * @param fs        file system to write to
     * @param agents    transformation utilities
     * @throws IOException if a metadata file can not be written
     */
    @Override
    public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
        HashMap<String, Long> map = new HashMap<String,Long>();

        DistinctValue d = new DistinctValue();
        while(values.hasNext()) {
            // NOTE(review): the previously returned object is reset before the
            // next one is fetched; kept as is since the iterator may hand out a
            // reused instance -- confirm against DistinctValue
            d.reset();
            d = values.next();

            String word = d.getWord();
            Long count = d.getCount();
            Long val = map.get(word);
            map.put(word, (val == null) ? count : Long.valueOf(val + count));
        }

        writeMetadata(map, outputDir, colID, fs, agents, false);
    }

    // ------------------------------------------------------------------------------------------------

    /**
     * @return the internal (live) maps column ID -> (distinct value -> count,
     *         or recode index once the metadata has been written)
     */
    public HashMap<Integer, HashMap<String,Long>> getCPRecodeMaps() { return _rcdMaps; }

    /**
     * @return the recode maps loaded by {@link #loadTxMtd}, or null if none
     *         have been loaded
     */
    public HashMap<Integer, HashMap<String,String>> getRecodeMaps() {
        return _finalMaps;
    }

    /**
     * Method to load recode maps of all attributes, at once.
     *
     * @param job      job configuration (not used here)
     * @param fs       file system to read from; it is not closed by this method
     * @param txMtdDir directory containing the transformation metadata
     * @param agents   transformation utilities
     * @throws IOException if a recode map can not be read or is malformed
     */
    @Override
    public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
        if ( _rcdList == null )
            return;

        _finalMaps = new HashMap<Integer, HashMap<String, String>>();

        // The caller-supplied FileSystem is not closed here: this method did
        // not open it, and it stays open on the success path as well.
        if ( !fs.isDirectory(txMtdDir) )
            throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);

        for(int colID : _rcdList) {
            Path path = new Path( txMtdDir + "/Recode/" + agents.getName(colID) + RCD_MAP_FILE_SUFFIX);
            TfUtils.checkValidInputFile(fs, path, true);

            HashMap<String,String> map = new HashMap<String,String>();
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(path)));
            try {
                String line = null;
                // Example line to parse: "WN (1)67492",1,61975
                while((line=br.readLine())!=null) {
                    // last occurrence of quotation mark
                    int idxQuote = line.lastIndexOf('"');
                    String word = UtilFunctions.unquote(line.substring(0,idxQuote+1));

                    // the recode index sits between the separator following
                    // the closing quote and the next separator
                    int start = idxQuote+2;
                    int end = line.indexOf(TXMTD_SEP.charAt(0), start);
                    if ( end < 0 )
                        throw new IOException("Malformed recode map entry in " + path + ": " + line);

                    map.put(word, line.substring(start, end));
                }
            }
            finally {
                br.close();
            }
            _finalMaps.put(colID, map);
        }
    }

    /**
     * Method to apply transformations.
     * Replaces the value of every RECODE column by its entry in the loaded
     * recode maps; a value without entry is replaced by null.
     *
     * @param words  column values of the row, modified in place
     * @param agents transformation utilities (not used here)
     * @return the given array
     */
    @Override
    public String[] apply(String[] words, TfUtils agents) {
        if ( _rcdList == null )
            return words;

        for(int colID : _rcdList) {
            HashMap<String,String> map = (_finalMaps == null) ? null : _finalMaps.get(colID);
            if ( map == null )
                throw new RuntimeException("No recode map available for colID=" + colID + " (map = null)");

            words[colID-1] = map.get(UtilFunctions.unquote(words[colID-1].trim()));
        }
        return words;
    }

    /**
     * Check if the given column ID is subjected to this transformation.
     *
     * A linear scan is used since the list keeps the order of the
     * specification and is therefore not guaranteed to be sorted.
     *
     * @param colID column ID
     * @return index of the column within the RECODE list, or -1 if the column
     *         is not recoded
     */
    public int isRecoded(int colID)
    {
        if(_rcdList == null)
            return -1;

        for(int i=0; i < _rcdList.length; i++)
            if ( _rcdList[i] == colID )
                return i;
        return -1;
    }

    /**
     * Replaces the value of every RECODE column by its entry in the locally
     * built maps (which hold recode indexes once the metadata has been written).
     *
     * @param words  column values of the row, modified in place
     * @param agents transformation utilities
     * @return the given array
     * @throws RuntimeException if a value has no entry in the map of its column
     */
    public String[] cp_apply(String[] words, TfUtils agents) {
        if ( _rcdList == null )
            return words;

        for(int colID : _rcdList) {
            HashMap<String,Long> map = _rcdMaps.get(colID);
            String raw = words[colID-1];
            String w = (raw == null) ? null : UtilFunctions.unquote(raw.trim());

            Long code = (map == null || w == null) ? null : map.get(w);
            if ( code == null ) {
                if ( w != null && w.isEmpty() && agents.isNA("") )
                    throw new RuntimeException("Empty string (a missing value) in column ID " + colID + " is not handled. Consider adding an imputation method on this column.");
                throw new RuntimeException("ColID="+colID + ", word=" + raw + ", maps entry not found (map = " + map + ")");
            }

            words[colID-1] = Long.toString(code);
        }
        return words;
    }

    /**
     * Prints the locally built maps of all columns to standard output.
     */
    public void printMaps() {
        for(Map.Entry<Integer, HashMap<String,Long>> column : _rcdMaps.entrySet()) {
            System.out.println("Column " + column.getKey());
            for(Map.Entry<String,Long> entry : column.getValue().entrySet()) {
                System.out.println("    " + entry.getKey() + " : " + entry.getValue());
            }
        }
    }

    /**
     * Prints the RECODE column list to standard output.
     */
    public void print() {
        System.out.print("Recoding List: \n    ");
        if ( _rcdList != null ) {
            for(int i : _rcdList) {
                System.out.print(i + " ");
            }
        }
        System.out.println();
    }
}