/*
* avenir: Predictive analytics based on Hadoop MapReduce
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.avenir.explore;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.avenir.util.ContingencyMatrix;
import org.chombo.mr.FeatureField;
import org.chombo.util.FeatureSchema;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;
/**
* Base class for categorical attribute correlation mapper and reducer
* @author pranab
*/
public class CategoricalCorrelation {
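/*
* Minimal driver sketch (illustrative only, not part of the original class).
* It assumes the standard Hadoop Job API and the cac.* configuration keys
* read in the setup() methods below; the schema path and attribute ordinals
* are placeholders:
*
*   Configuration conf = new Configuration();
*   conf.set("cac.feature.schema.file.path", "/path/to/feature/schema.json");
*   conf.set("cac.first.set.attributes", "1,2");
*   conf.set("cac.second.set.attributes", "5,6");
*   Job job = Job.getInstance(conf, "categorical attribute correlation");
*   job.setJarByClass(CategoricalCorrelation.class);
*   job.setMapperClass(CategoricalCorrelation.CorrelationMapper.class);
*   job.setReducerClass(...);   //a concrete CorrelationReducer subclass
*   job.setMapOutputKeyClass(Tuple.class);
*   job.setMapOutputValueClass(Text.class);
*   job.setOutputKeyClass(NullWritable.class);
*   job.setOutputValueClass(Text.class);
*/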
/**
* Mapper that builds an in memory contingency matrix for every source and
* destination attribute pair and emits the serialized matrices in cleanup
* @author pranab
*/
public static class CorrelationMapper extends Mapper<LongWritable, Text, Tuple, Text> {
private String fieldDelimRegex;
private String[] items;
private Text outVal = new Text();
private FeatureSchema schema;
private int[] sourceAttrs;
private int[] destAttrs;
private Map<Tuple, ContingencyMatrix> contMatrices = new HashMap<Tuple, ContingencyMatrix>();
private List<FeatureField> srcFields = new ArrayList<FeatureField>();
private List<FeatureField> dstFields = new ArrayList<FeatureField>();
private List<Tuple> attrPairs = new ArrayList<Tuple>();
private static final Logger LOG = Logger.getLogger(CorrelationMapper.class);
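//design note: matrices are accumulated in memory across map() calls and
//emitted once per attribute pair in cleanup(), which acts as an in-mapper
//combiner and keeps per mapper shuffle volume independent of input size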
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
if (conf.getBoolean("debug.on", false)) {
LOG.setLevel(Level.DEBUG);
}
fieldDelimRegex = conf.get("field.delim.regex", ",");
InputStream fs = Utility.getFileStream(conf, "cac.feature.schema.file.path");
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(fs, FeatureSchema.class);
sourceAttrs = Utility.intArrayFromString(conf.get("cac.first.set.attributes"), ",");
destAttrs = Utility.intArrayFromString(conf.get("cac.second.set.attributes"), ",");
//look up destination fields once, then initialize a contingency matrix
//for every source and destination attribute pair
for (int dst : destAttrs) {
dstFields.add(schema.findFieldByOrdinal(dst));
}
for (int src : sourceAttrs) {
FeatureField srcField = schema.findFieldByOrdinal(src);
int srcSize = srcField.getCardinality().size();
srcFields.add(srcField);
for (FeatureField dstField : dstFields) {
int dst = dstField.getOrdinal();
LOG.debug("attr ordinals:" + src + " " + dst);
//skip self pairs; map() applies the same check so the pair list
//stays aligned with the field iteration order
if (src != dst) {
int dstSize = dstField.getCardinality().size();
LOG.debug("attr cardinality:" + srcSize + " " + dstSize);
Tuple key = new Tuple();
key.add(src, dst);
contMatrices.put(key, new ContingencyMatrix(srcSize, dstSize));
attrPairs.add(key);
}
}
}
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
@Override
protected void cleanup(Context context) throws IOException, InterruptedException {
//emit one serialized contingency matrix per attribute pair
for (Tuple keyVal : attrPairs) {
ContingencyMatrix contMat = contMatrices.get(keyVal);
outVal.set(contMat.serialize());
context.write(keyVal, outVal);
}
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
items = value.toString().split(fieldDelimRegex);
//update the contingency matrix for each attribute pair, skipping self
//pairs to match the pair list built in setup()
int attrPairIndex = 0;
for (FeatureField srcField : srcFields) {
String srcVal = items[srcField.getOrdinal()];
int srcIndex = srcField.cardinalityIndex(srcVal);
for (FeatureField dstField : dstFields) {
if (srcField.getOrdinal() == dstField.getOrdinal()) {
continue;
}
String dstVal = items[dstField.getOrdinal()];
int dstIndex = dstField.cardinalityIndex(dstVal);
ContingencyMatrix contMat = contMatrices.get(attrPairs.get(attrPairIndex++));
contMat.increment(srcIndex, dstIndex);
}
}
}
}
/**
* Reducer that aggregates the per mapper contingency matrices for each
* attribute pair; a concrete subclass supplies the correlation statistic
* @author pranab
*/
public static abstract class CorrelationReducer extends Reducer<Tuple, Text, NullWritable, Text> {
private FeatureSchema schema;
private FeatureField srcField = null;
private FeatureField dstField = null;
private int srcSize = 0;
private int dstSize = 0;
protected ContingencyMatrix contMat;
private Text outVal = new Text();
private ContingencyMatrix thisContMat = new ContingencyMatrix();
private String fieldDelim;
//scale factor available to subclasses that report the statistic as a scaled integer
protected int corrScale;
private static final Logger LOG = Logger.getLogger(CorrelationReducer.class);
@Override
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
if (conf.getBoolean("debug.on", false)) {
LOG.setLevel(Level.DEBUG);
}
InputStream fs = Utility.getFileStream(conf, "cac.feature.schema.file.path");
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(fs, FeatureSchema.class);
fieldDelim = conf.get("field.delim.out", ",");
corrScale = conf.getInt("cac.correlation.scale", 1000);
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
@Override
protected void reduce(Tuple key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
srcField = schema.findFieldByOrdinal(key.getInt(0));
dstField = schema.findFieldByOrdinal(key.getInt(1));
srcSize = srcField.getCardinality().size();
dstSize = dstField.getCardinality().size();
contMat = new ContingencyMatrix(srcSize, dstSize);
LOG.debug("attr pairs:" + key.getInt(0) + " " + key.getInt(1));
thisContMat.initialize(srcSize, dstSize);
for (Text value : values) {
LOG.debug("cont matrix:" + value.toString() );
thisContMat.deseralize(value.toString());
contMat.aggregate(thisContMat);
}
outVal.set(srcField.getName() + fieldDelim + dstField.getName() + fieldDelim + getCorrelationStat());
context.write(NullWritable.get(),outVal);
}
/**
* @return the correlation statistic computed from the aggregated
* contingency matrix; must be implemented by the concrete subclass
*/
protected abstract double getCorrelationStat();
}
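/*
* Illustrative concrete reducer (a hedged sketch, not part of the original
* class). The cramerIndex() accessor on ContingencyMatrix is assumed here
* purely for the example; substitute whatever statistic accessor the actual
* ContingencyMatrix implementation provides:
*
*   public static class CramerCorrelationReducer extends CorrelationReducer {
*       @Override
*       protected double getCorrelationStat() {
*           //hypothetical accessor returning Cramer's V for the aggregated matrix
*           return contMat.cramerIndex();
*       }
*   }
*/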
}