/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysml.runtime.transform;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.sysml.lops.Lop;
import org.apache.sysml.runtime.io.IOUtilFunctions;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.Pair;
import org.apache.sysml.runtime.transform.MVImputeAgent.MVMethod;
import org.apache.sysml.runtime.transform.decode.DecoderRecode;
import org.apache.sysml.runtime.transform.encode.Encoder;
import org.apache.sysml.runtime.transform.meta.TfMetaUtils;
import org.apache.sysml.runtime.util.UtilFunctions;
import org.apache.wink.json4j.JSONException;
import org.apache.wink.json4j.JSONObject;
public class RecodeAgent extends Encoder
{
// serialization version id of this encoder
private static final long serialVersionUID = 8213163881283341874L;
// column IDs parsed from the TfUtils.TXMETHOD_MVRCD entry of the spec
// (stays null if the spec has no such entry)
private int[] _mvrcdList = null;
// _colList followed by _mvrcdList; only allocated by the constructor if
// at least one column was collected, otherwise it stays null
private int[] _fullrcdList = null;
//recode maps and custom map for partial recode maps
// _rcdMaps: column ID -> (token -> long value); the value is an occurrence
// count while collecting in prepare, and a recode index after build or
// writeMetadata
private HashMap<Integer, HashMap<String, Long>> _rcdMaps = new HashMap<Integer, HashMap<String, Long>>();
// _finalMaps: column ID -> (key -> value) as parsed from the recode map
// files by loadTxMtd; null until loadTxMtd has been called
private HashMap<Integer, HashMap<String,String>> _finalMaps = null;
// _rcdMapsPart: column ID -> set of distinct tokens (without codes);
// allocated lazily by buildPartial
private HashMap<Integer, HashSet<Object>> _rcdMapsPart = null;
/**
 * Creates a recode agent from a parsed transformation specification.
 *
 * Columns listed under TfUtils.TXMETHOD_RECODE are registered via
 * initColList, columns listed under TfUtils.TXMETHOD_MVRCD are kept in
 * a separate list; both lists are concatenated (in that order) into the
 * full recode list if at least one column was collected.
 *
 * @param parsedSpec parsed transformation specification
 * @param colnames column names passed to the ID list parser
 * @param clen number of columns, forwarded to the super constructor
 * @throws JSONException if the specification cannot be parsed
 */
public RecodeAgent(JSONObject parsedSpec, String[] colnames, int clen)
    throws JSONException
{
    super(null, clen);

    //collect the columns of both recode-related spec entries
    int numRecodeCols = 0;
    if( parsedSpec.containsKey(TfUtils.TXMETHOD_RECODE) ) {
        int[] recodeCols = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_RECODE);
        numRecodeCols = initColList(recodeCols);
    }
    if( parsedSpec.containsKey(TfUtils.TXMETHOD_MVRCD) ) {
        _mvrcdList = TfMetaUtils.parseJsonIDList(parsedSpec, colnames, TfUtils.TXMETHOD_MVRCD);
        numRecodeCols += _mvrcdList.length;
    }

    //nothing collected, keep the full list unallocated
    if( numRecodeCols <= 0 )
        return;

    //concatenate both lists, plain recode columns first
    _fullrcdList = new int[numRecodeCols];
    int pos = 0;
    if( _colList != null ) {
        for( int colID : _colList )
            _fullrcdList[pos++] = colID;
    }
    if( _mvrcdList != null ) {
        for( int colID : _mvrcdList )
            _fullrcdList[pos++] = colID;
    }
}
/**
 * Returns the recode maps (column ID to token/value map) held by this
 * agent. The internal map instance is returned directly, not a copy.
 *
 * @return the internal recode maps
 */
public HashMap<Integer, HashMap<String,Long>> getCPRecodeMaps() {
return _rcdMaps;
}
/**
 * Returns the partial recode maps (column ID to set of distinct tokens)
 * as constructed by buildPartial. The internal instance is returned
 * directly; it is null if buildPartial has not allocated it yet.
 *
 * @return the internal partial recode maps, possibly null
 */
public HashMap<Integer, HashSet<Object>> getCPRecodeMapsPartial() {
return _rcdMapsPart;
}
/**
 * Returns the recode maps loaded by loadTxMtd (column ID to key/value
 * map). The internal instance is returned directly; it is null if no
 * maps have been loaded.
 *
 * @return the loaded recode maps, possibly null
 */
public HashMap<Integer, HashMap<String,String>> getRecodeMaps() {
return _finalMaps;
}
/**
 * Updates the per-column token counts with the values of a single row.
 *
 * For every column of the full recode list, the trimmed and unquoted
 * value words[colID-1] is looked up in the column's map and its count
 * is incremented (or initialized to 1).
 *
 * @param words values of one input row, addressed by 1-based column ID
 * @param agents transformation agents (not used by this method)
 */
void prepare(String[] words, TfUtils agents) {
    //guard on the list that is actually iterated: it stays unallocated
    //if the spec contains recode entries without any columns, in which
    //case the individual lists are non-null but there is nothing to do
    if( _fullrcdList == null )
        return;

    for( int colID : _fullrcdList ) {
        String w = UtilFunctions.unquote(words[colID-1].trim());

        //allocate column map on first use
        HashMap<String, Long> map = _rcdMaps.get(colID);
        if( map == null ) {
            map = new HashMap<String, Long>();
            _rcdMaps.put(colID, map);
        }

        //increment the occurrence count of the token
        Long count = map.get(w);
        if( count == null )
            map.put(w, Long.valueOf(1));
        else
            map.put(w, Long.valueOf(count + 1));
    }
}
/**
 * Accounts for constant missing value imputation in the given map.
 *
 * If the column is imputed with MVMethod.CONSTANT, the (unquoted)
 * replacement value is added to the map, or its existing count is
 * increased, by the difference between agents.getValid() and the
 * column's non-missing-value count. For any other imputation method
 * the map is returned unchanged.
 *
 * @param colID column ID
 * @param agents transformation agents providing the imputation agent
 * @param map token counts of the column, modified in place
 * @return the given map instance
 */
private HashMap<String, Long> handleMVConstant(int colID, TfUtils agents, HashMap<String, Long> map)
{
    MVImputeAgent imputer = agents.getMVImputeAgent();

    //only constant imputation contributes an additional token
    if( imputer.getMethod(colID) != MVMethod.CONSTANT )
        return map;

    String replacement = imputer.getReplacement(colID);
    if( replacement == null )
        throw new RuntimeException("Expecting a constant replacement value for column ID " + colID);
    replacement = UtilFunctions.unquote(replacement);

    //add the replacement if it is not part of the map, otherwise
    //increase its existing count
    Long existing = map.get(replacement);
    long numMissing = agents.getValid() - imputer.getNonMVCount(colID);
    if( existing != null )
        map.put(replacement, existing + numMissing);
    else
        map.put(replacement, numMissing);

    return map;
}
/**
 * Method to output transformation metadata from the mappers.
 * This information is collected and merged by the reducers.
 *
 * Delegates to mapOutputHelper with the given output collector and
 * without a result list.
 *
 * @param out output collector receiving (column ID, distinct value) pairs
 * @param taskID task ID, forwarded to mapOutputHelper
 * @param agents transformation agents, forwarded to mapOutputHelper
 * @throws IOException if mapOutputHelper fails
 */
@Override
public void mapOutputTransformationMetadata(OutputCollector<IntWritable, DistinctValue> out, int taskID, TfUtils agents) throws IOException {
mapOutputHelper(taskID, out, null, agents);
}
/**
 * Variant of mapOutputTransformationMetadata that appends the
 * (column ID, distinct value) pairs to a list instead of emitting them
 * to an output collector.
 *
 * @param taskID task ID, forwarded to mapOutputHelper
 * @param list list the pairs are appended to
 * @param agents transformation agents, forwarded to mapOutputHelper
 * @return the given list instance
 * @throws IOException if mapOutputHelper fails
 */
public ArrayList<Pair<Integer, DistinctValue>> mapOutputTransformationMetadata(int taskID, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
mapOutputHelper(taskID, null, list, agents);
return list;
}
/**
 * Emits the collected (token, count) pairs of all recode columns.
 *
 * For every column of the full recode list that has a map, constant
 * missing value imputation is accounted for via handleMVConstant and
 * each map entry is emitted as a DistinctValue, either to the output
 * collector (if given) or otherwise to the list (if given).
 *
 * @param taskID task ID (not used by this method)
 * @param out output collector, may be null
 * @param list result list used if no collector is given, may be null
 * @param agents transformation agents
 * @throws IOException wrapping any exception raised while emitting
 */
public void mapOutputHelper(int taskID, OutputCollector<IntWritable, DistinctValue> out, ArrayList<Pair<Integer, DistinctValue>> list, TfUtils agents) throws IOException {
    //guard on the list that is actually iterated: it stays unallocated
    //if the spec contains recode entries without any columns
    if( _fullrcdList == null )
        return;

    try
    {
        for( int colID : _fullrcdList )
        {
            HashMap<String, Long> map = _rcdMaps.get(colID);
            if( map == null )
                continue;

            map = handleMVConstant(colID, agents, map);

            //iterate over entries to avoid a second lookup per token
            if( out != null ) {
                IntWritable iw = new IntWritable(colID);
                for( Entry<String, Long> entry : map.entrySet() )
                    out.collect(iw, new DistinctValue(entry.getKey(), entry.getValue()));
            }
            else if( list != null ) {
                for( Entry<String, Long> entry : map.entrySet() )
                    list.add(new Pair<Integer,DistinctValue>(colID, new DistinctValue(entry.getKey(), entry.getValue())) );
            }
        }
    }
    catch(Exception e) {
        throw new IOException(e);
    }
}
/**
 * Function to output transformation metadata, including:
 * - recode maps,
 * - number of distinct values,
 * - mode, and
 * - imputation value (in the case of global_mode)
 *
 * The column for which this function is invoked can be one of the following:
 * - just recoded (write .map, .ndistinct, .mode)
 * - just mv imputed (w/ global_mode) (write .impute)
 * - both recoded and mv imputed (write .map, .ndistinct, .mode, .impute)
 *
 * As a side effect, NA strings are removed from the given map and the
 * counts in the map are replaced by the assigned recode indexes
 * (1-based, in sorted order of the tokens).
 *
 * @param map token counts of the column, modified in place
 * @param outputDir output directory
 * @param colID column id
 * @param fs file system
 * @param agents transformation agents providing the imputation agent,
 *        the column name, and the NA strings
 * @param fromCP if true, constant missing value imputation is accounted
 *        for via handleMVConstant before the map is written
 * @throws IOException if IOException occurs
 */
private void writeMetadata(HashMap<String,Long> map, String outputDir, int colID, FileSystem fs, TfUtils agents, boolean fromCP) throws IOException {
    // output recode maps and mode
    MVImputeAgent mvagent = agents.getMVImputeAgent();
    String mode = null;
    int rcdIndex = 0, modeIndex = 0;
    long maxCount = Long.MIN_VALUE;

    boolean isRecoded = (isApplicable(colID) != -1);
    boolean isModeImputed = (mvagent.getMethod(colID) == MVMethod.GLOBAL_MODE);

    Path pt=new Path(outputDir+"/Recode/"+ agents.getName(colID) + TfUtils.TXMTD_RCD_MAP_SUFFIX);
    BufferedWriter br=null;
    try {
        // the map file is only written for recoded columns
        if(isRecoded)
            br = new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)));

        // remove NA strings
        if ( agents.getNAStrings() != null)
            for(String naword : agents.getNAStrings())
                map.remove(naword);

        if(fromCP)
            map = handleMVConstant(colID, agents, map);

        if ( map.size() == 0 )
            throw new RuntimeException("Can not proceed since \"" + agents.getName(colID) + "\" (id=" + colID + ") contains only the missing values, and not a single valid value -- set imputation method to \"constant\".");

        // Order entries by category (string) value
        List<String> newNames = new ArrayList<String>(map.keySet());
        Collections.sort(newNames);

        for(String w : newNames) {
            Long count = map.get(w);
            ++rcdIndex;

            // output (w, count, rcdIndex)
            if(br != null)
                br.write(UtilFunctions.quote(w) + TfUtils.TXMTD_SEP + rcdIndex + TfUtils.TXMTD_SEP + count + "\n");

            // track the most frequent token (first one wins on ties)
            if(maxCount < count) {
                maxCount = count;
                mode = w;
                modeIndex = rcdIndex;
            }

            // Replace count with recode index (useful when invoked from CP)
            map.put(w, (long)rcdIndex);
        }

        // close explicitly on the success path so that a failed flush or
        // close is reported instead of silently leaving a truncated map file
        if(br != null)
            br.close();
    }
    finally {
        // cleanup on the failure path; closing an already closed
        // writer has no effect
        IOUtilFunctions.closeSilently(br);
    }

    if ( mode == null ) {
        mode = "";
        maxCount = 0;
    }

    if ( isRecoded )
    {
        // output mode
        pt=new Path(outputDir+"/Recode/"+ agents.getName(colID) + TfUtils.MODE_FILE_SUFFIX);
        try(BufferedWriter br2=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))) ) {
            br2.write(UtilFunctions.quote(mode) + "," + modeIndex + "," + maxCount );
        }

        // output number of distinct values
        pt=new Path(outputDir+"/Recode/"+ agents.getName(colID) + TfUtils.TXMTD_RCD_DISTINCT_SUFFIX);
        try(BufferedWriter br2=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true))) ) {
            br2.write(""+map.size());
        }
    }

    if (isModeImputed)
    {
        // output the mode as imputation value
        pt=new Path(outputDir+"/Impute/"+ agents.getName(colID) + TfUtils.TXMTD_MV_FILE_SUFFIX);
        try( BufferedWriter br2=new BufferedWriter(new OutputStreamWriter(fs.create(pt,true)))) {
            br2.write(colID + "," + UtilFunctions.quote(mode));
        }
    }
}
/**
 * Writes the transformation metadata of all columns of the full recode
 * list, based on the internally collected recode maps.
 *
 * @param outputDir output directory
 * @param fs file system
 * @param agents transformation agents
 * @throws IOException if writing the metadata fails
 */
public void outputTransformationMetadata(String outputDir, FileSystem fs, TfUtils agents) throws IOException {
    //guard on the list that is actually iterated: it stays unallocated
    //if the spec contains recode entries without any columns
    if( _fullrcdList == null )
        return;

    for( int colID : _fullrcdList )
        writeMetadata(_rcdMaps.get(colID), outputDir, colID, fs, agents, true);
}
/**
 * Method to merge map output transformation metadata.
 *
 * Sums up the counts of all given distinct values per word and writes
 * the merged result via writeMetadata.
 *
 * @param values distinct values (word and count) of the column
 * @param outputDir output directory
 * @param colID column ID
 * @param fs file system
 * @param agents transformation agents
 * @throws IOException if writing the metadata fails
 */
@Override
public void mergeAndOutputTransformationMetadata(Iterator<DistinctValue> values, String outputDir, int colID, FileSystem fs, TfUtils agents) throws IOException {
    HashMap<String, Long> merged = new HashMap<String,Long>();
    DistinctValue current = new DistinctValue();

    while( values.hasNext() ) {
        //reset the previously held value before fetching the next one
        current.reset();
        current = values.next();

        String token = current.getWord();
        Long partial = current.getCount();

        //aggregate the partial count into the merged map
        Long total = merged.get(token);
        if( total != null )
            merged.put(token, total+partial);
        else
            merged.put(token, partial);
    }

    writeMetadata(merged, outputDir, colID, fs, agents, false);
}
/**
 * Method to load recode maps of all attributes, at once.
 *
 * For every registered recode column, the recode map file below
 * txMtdDir/Recode is parsed line by line into a key/value map.
 *
 * @param job job configuration (not used by this method)
 * @param fs file system
 * @param txMtdDir directory of the transformation metadata
 * @param agents transformation agents providing the column names
 * @throws IOException if reading a recode map file fails
 */
@Override
public void loadTxMtd(JobConf job, FileSystem fs, Path txMtdDir, TfUtils agents) throws IOException {
    if( !isApplicable() )
        return;

    _finalMaps = new HashMap<Integer, HashMap<String, String>>();

    if( !fs.isDirectory(txMtdDir) )
        throw new RuntimeException("Path to recode maps must be a directory: " + txMtdDir);

    for( int colID : _colList ) {
        Path mapFile = new Path( txMtdDir + "/Recode/" + agents.getName(colID) + TfUtils.TXMTD_RCD_MAP_SUFFIX);
        TfUtils.checkValidInputFile(fs, mapFile, true);

        HashMap<String,String> colMap = new HashMap<String,String>();
        Pair<String,String> entry = new Pair<String,String>(); //reused for all lines
        try( BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(mapFile))) ) {
            // Example line to parse: "WN (1)67492",1,61975
            for( String line = reader.readLine(); line != null; line = reader.readLine() ) {
                DecoderRecode.parseRecodeMapEntry(line, entry);
                colMap.put(entry.getKey(), entry.getValue());
            }
        }
        _finalMaps.put(colID, colMap);
    }
}
/**
 * Looks up the recode value of a key for the given column.
 *
 * The loaded maps are used if available; otherwise (used for cp) the
 * internal recode maps are probed and the code is converted to a string.
 *
 * @param colID column ID
 * @param key token to look up
 * @return recode value as string, or null if the key is unknown
 */
private String lookupRCDMap(int colID, String key) {
    if( _finalMaps == null ) {
        //used for cp
        Long code = _rcdMaps.get(colID).get(key);
        if( code == null )
            return null;
        return Long.toString(code);
    }
    return _finalMaps.get(colID).get(key);
}
/**
 * Builds the recode maps from the input frame and applies them.
 *
 * @param in input frame
 * @param out output matrix the codes are written to
 * @return the given output matrix
 */
@Override
public MatrixBlock encode(FrameBlock in, MatrixBlock out) {
    if( isApplicable() ) {
        //build and apply recode maps
        build(in);
        apply(in, out);
    }
    return out;
}
/**
 * Builds the recode maps of all registered columns from the input frame.
 *
 * Each non-null, non-empty token that is not yet part of its column map
 * gets the next code, i.e., the map size plus one.
 *
 * @param in input frame
 */
@Override
public void build(FrameBlock in) {
    if( !isApplicable() )
        return;

    Iterator<String[]> rows = in.getStringRowIterator();
    while( rows.hasNext() ) {
        String[] row = rows.next();
        for( int colID : _colList ) { //1-based
            //allocate column map if necessary
            if( !_rcdMaps.containsKey(colID) )
                _rcdMaps.put(colID, new HashMap<String,Long>());

            //probe column map, skip missing and known tokens
            HashMap<String,Long> codes = _rcdMaps.get(colID);
            String token = row[colID-1];
            if( token == null || token.isEmpty() || codes.containsKey(token) )
                continue;

            codes.put(token, Long.valueOf(codes.size()+1));
        }
    }
}
/**
 * Builds the partial recode maps, i.e., the sets of distinct tokens
 * (without codes) of all registered columns, from the input frame.
 *
 * @param in input frame
 */
public void buildPartial(FrameBlock in) {
    if( !isApplicable() )
        return;

    //ensure allocated partial recode map
    if( _rcdMapsPart == null )
        _rcdMapsPart = new HashMap<Integer, HashSet<Object>>();

    //construct partial recode map (tokens w/o codes)
    //iterate over columns for sequential access
    for( int colID : _colList ) { //1-based
        //allocate column set if necessary
        if( !_rcdMapsPart.containsKey(colID) )
            _rcdMapsPart.put(colID, new HashSet<Object>());
        HashSet<Object> tokens = _rcdMapsPart.get(colID);

        //collect all values of the column
        int row = 0;
        while( row < in.getNumRows() ) {
            tokens.add(in.get(row, colID-1));
            row++;
        }

        //cleanup unnecessary entries once
        tokens.remove(null);
        tokens.remove("");
    }
}
/**
 * Method to apply transformations.
 *
 * Replaces, in place, the value of every registered recode column by
 * its recode value, or by "NaN" if the token is unknown.
 *
 * @param words values of one input row, addressed by 1-based column ID
 * @return the given array
 */
@Override
public String[] apply(String[] words)
{
    if( !isApplicable() )
        return words;

    //apply recode maps on relevant columns of given row
    for( int colID : _colList ) {
        int pos = colID - 1;

        //prepare input and get code
        String token = UtilFunctions.unquote(words[pos].trim());
        String code = lookupRCDMap(colID, token);

        // replace unseen keys with NaN
        if( code == null )
            code = "NaN";
        words[pos] = code;
    }

    return words;
}
/**
 * Applies the recode maps column-wise to the input frame and writes the
 * codes into the output matrix; unknown or null tokens yield NaN.
 *
 * @param in input frame
 * @param out output matrix the codes are written to
 * @return the given output matrix
 */
@Override
public MatrixBlock apply(FrameBlock in, MatrixBlock out) {
    //nothing to recode; this guard is consistent with apply(String[])
    //and avoids dereferencing the column list if this encoder is not
    //applicable
    if( !isApplicable() )
        return out;

    //apply recode maps column wise
    for( int j=0; j<_colList.length; j++ ) {
        int colID = _colList[j];
        for( int i=0; i<in.getNumRows(); i++ ) {
            Object okey = in.get(i, colID-1);
            String key = (okey!=null) ? okey.toString() : null;
            String val = lookupRCDMap(colID, key);
            out.quickSetValue(i, colID-1, (val!=null) ?
                Double.parseDouble(val) : Double.NaN);
        }
    }

    return out;
}
/**
 * Writes the recode maps of all registered columns into the given meta
 * data frame and sets the number of distinct values per column.
 *
 * Each map entry is stored as one cell (see constructRecodeMapEntry) in
 * the column of the respective recode column. Columns without a recode
 * map get no entries and zero distinct values.
 *
 * @param meta meta data frame to fill
 * @return the given meta data frame
 */
@Override
public FrameBlock getMetaData(FrameBlock meta) {
    if( !isApplicable() )
        return meta;

    //inverse operation to initRecodeMaps

    //allocate output rows
    int maxDistinct = 0;
    for( int j=0; j<_colList.length; j++ ) {
        HashMap<String,Long> map = _rcdMaps.get(_colList[j]);
        if( map != null )
            maxDistinct = Math.max(maxDistinct, map.size());
    }
    meta.ensureAllocatedColumns(maxDistinct);

    //create compact meta data representation
    for( int j=0; j<_colList.length; j++ ) {
        int colID = _colList[j]; //1-based
        HashMap<String,Long> map = _rcdMaps.get(colID);

        //a column might not have a map (e.g., nothing was built for it),
        //hence the number of distinct values must not dereference it
        int numDistinct = 0;
        if( map != null ) {
            int rowID = 0;
            for( Entry<String, Long> e : map.entrySet() ) {
                String tmp = constructRecodeMapEntry(e.getKey(), e.getValue());
                meta.set(rowID++, colID-1, tmp);
            }
            numDistinct = map.size();
        }
        meta.getColumnMetadata(colID-1).setNumDistinct(numDistinct);
    }

    return meta;
}
/**
 * Construct the recodemaps from the given input frame for all
 * columns registered for recode.
 *
 * Nothing is done if this encoder is not applicable, or if the given
 * frame is null or has no rows.
 *
 * @param meta frame block
 */
public void initMetaData( FrameBlock meta ) {
    //the applicability guard is consistent with the other methods that
    //access the column list and avoids dereferencing it if unset
    if( !isApplicable() || meta == null || meta.getNumRows()<=0 )
        return;

    for( int j=0; j<_colList.length; j++ ) {
        int colID = _colList[j]; //1-based
        _rcdMaps.put(colID, meta.getRecodeMap(colID-1));
    }
}
/**
 * Returns the recode map entry, which is the concatenation of the
 * token, the delimiter Lop.DATATYPE_PREFIX, and the code (in that order).
 *
 * @param token token of the recode map entry
 * @param code code assigned to the token
 * @return token and code with the delimiter in between
 */
public static String constructRecodeMapEntry(String token, Long code) {
    StringBuilder entry = new StringBuilder();
    entry.append(token);
    entry.append(Lop.DATATYPE_PREFIX);
    entry.append(code.toString());
    return entry.toString();
}
}