/*
 * avenir: Predictive analytics based on Hadoop MapReduce
* Author: Pranab Ghosh
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.avenir.regress;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.List;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.chombo.util.FeatureSchema;
import org.chombo.util.Tuple;
import org.chombo.util.Utility;
import org.codehaus.jackson.map.ObjectMapper;
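/**
 * Map reduce job for iterative training of a logistic regression model. In each pass the
 * mappers feed feature vectors and class labels to a LogisticRegressor and emit its
 * aggregate vector; the reducer combines the per mapper aggregates and appends the
 * combined vector as a new line of the coefficient file. The driver keeps launching
 * passes until the configured convergence criteria is satisfied.
 *
 * Configuration used: coeff.file.path, feature.schema.file.path, positive.class.value,
 * convergence.criteria (iterLimit, allBelowThreshold or averageBelowThreshold),
 * iteration.limit, convergence.threshold, field.delim.regex, field.delim.out,
 * num.reducer and debug.on.
 *
 * @author pranab
 */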
public class LogisticRegressionJob extends Configured implements Tool {
private static final String ITER_LIMIT = "iterLimit";
private static final String ALL_BELOW_THRESHOLD = "allBelowThreshold";
private static final String AVERAGE_BELOW_THRESHOLD = "averageBelowThreshold";
private static final int CONVERGED = 100;
private static final int NOT_CONVERGED = 101;
@Override
public int run(String[] args) throws Exception {
Job job = new Job(getConf());
String jobName = "Logistic regression";
job.setJobName(jobName);
job.setJarByClass(LogisticRegressionJob.class);
Utility.setConfiguration(job.getConfiguration(), "avenir");
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
job.setMapperClass(LogisticRegressionJob.RegressionMapper.class);
job.setReducerClass(LogisticRegressionJob.RegressionReducer.class);
job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Tuple.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
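		//all partial aggregates must reach the reducer; defaults to a single reducer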
job.setNumReduceTasks(job.getConfiguration().getInt("num.reducer", 1));
int status = job.waitForCompletion(true) ? 0 : 1;
Configuration conf = job.getConfiguration();
if (status == 0) {
status = checkConvergence(conf);
}
return status;
}
	/**
	 * Checks whether the regression has converged based on the configured criteria.
	 * @param conf job configuration
	 * @return CONVERGED or NOT_CONVERGED
	 * @throws IOException
	 */
private int checkConvergence(Configuration conf) throws IOException {
int status = 0;
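		//the coefficient file holds one coefficient vector per line, starting with the initial values, most recent last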
List<String> lines = Utility.getFileLines(conf, "coeff.file.path");
String convCriteria = conf.get("convergence.criteria", ITER_LIMIT);
if (convCriteria.equals(ITER_LIMIT)) {
int iterLimit = conf.getInt("iteration.limit", 10);
status = lines.size() < iterLimit ? NOT_CONVERGED : CONVERGED;
} else {
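			//judge convergence from the change between the last two coefficient vectors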
double[] prevCoeff = Utility.doubleArrayFromString(lines.get(lines.size()-2));
double[] curCoeff = Utility.doubleArrayFromString(lines.get(lines.size()-1));
LogisticRegressor regressor = new LogisticRegressor(prevCoeff);
regressor.setAggregates(curCoeff);
			regressor.setConvergeThreshold(conf.getFloat("convergence.threshold", 5.0f));
if (convCriteria.equals(ALL_BELOW_THRESHOLD)) {
status = regressor.isAllConverged() ? CONVERGED : NOT_CONVERGED;
} else if (convCriteria.equals(AVERAGE_BELOW_THRESHOLD)) {
status = regressor.isAverageConverged() ? CONVERGED : NOT_CONVERGED;
} else {
				throw new IllegalArgumentException("Invalid convergence criteria: " + convCriteria);
}
}
return status;
}
	/**
	 * Mapper that feeds feature vectors and class labels to a LogisticRegressor and
	 * emits the accumulated aggregate vector for its input split.
	 * @author pranab
	 */
public static class RegressionMapper extends Mapper<LongWritable, Text, Text, Tuple> {
private String fieldDelimRegex;
private String[] items;
private Text outKey = new Text();
private Tuple outVal = new Tuple();
private FeatureSchema schema;
private int[] featureValues;
private int[] featureOrdinals;
private int classOrdinal;
private String classValue;
private int iterCount;
private double[] coefficients;
private LogisticRegressor regressor;
private static final Logger LOG = Logger.getLogger(RegressionMapper.class);
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
if (conf.getBoolean("debug.on", false)) {
LOG.setLevel(Level.DEBUG);
}
fieldDelimRegex = conf.get("field.delim.regex", ",");
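			//feature schema defining feature field ordinals and the class attribute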
InputStream fs = Utility.getFileStream(conf, "feature.schema.file.path");
ObjectMapper mapper = new ObjectMapper();
schema = mapper.readValue(fs, FeatureSchema.class);
//regression coefficients
List<String[]> lines = Utility.parseFileLines(conf, "coeff.file.path", fieldDelimRegex);
iterCount = lines.size();
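			//the last line holds the most recent coefficient vector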
String[] items = lines.get(lines.size() - 1);
coefficients = new double[items.length];
for (int i = 0; i < items.length; ++i) {
coefficients[i] = Double.parseDouble(items[i]);
}
String posClassVal = conf.get("positive.class.value");
regressor = new LogisticRegressor(coefficients, posClassVal);
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Mapper#cleanup(org.apache.hadoop.mapreduce.Mapper.Context)
*/
protected void cleanup(Context context) throws IOException, InterruptedException {
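			//emit this mapper's partial aggregate vector once, keyed by a random UUID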
double[] aggregate = regressor.getAggregates();
for (int i = 0; i < aggregate.length; ++i) {
outVal.append(aggregate[i]);
}
outKey.set(UUID.randomUUID().toString());
context.write(outKey, outVal);
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
items = value.toString().split(fieldDelimRegex);
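			//on the first record, cache feature and class field ordinals; element 0 of the feature vector is fixed at 1 for the intercept term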
if (null == featureValues) {
featureOrdinals = schema.getFeatureFieldOrdinals();
featureValues = new int[featureOrdinals.length + 1];
featureValues[0] = 1;
classOrdinal = schema.findClassAttrField().getOrdinal();
}
for (int i=0; i < featureOrdinals.length; ++i) {
featureValues[i+1] = Integer.parseInt(items[featureOrdinals[i]]);
}
classValue = items[classOrdinal];
regressor.aggregate(featureValues, classValue);
}
}
	/**
	 * Reducer that combines the per mapper aggregate vectors and saves the combined
	 * vector as the coefficient vector for the next iteration.
	 * @author pranab
	 */
public static class RegressionReducer extends Reducer<Text, Tuple, NullWritable, Text> {
private FeatureSchema schema;
private LogisticRegressor regressor;
private double[] aggregate;
private String fieldDelimOut;
private Text outVal = new Text();
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#setup(org.apache.hadoop.mapreduce.Reducer.Context)
*/
protected void setup(Context context) throws IOException, InterruptedException {
Configuration conf = context.getConfiguration();
fieldDelimOut = conf.get("field.delim.out", ",");
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#cleanup(org.apache.hadoop.mapreduce.Reducer.Context)
*/
protected void cleanup(Context context) throws IOException, InterruptedException {
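			//the combined aggregate vector becomes the coefficient vector for the next iteration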
double[] aggregates = regressor.getAggregates();
StringBuilder stBld = new StringBuilder();
stBld.append(aggregates[0]);
for (int i = 1; i < aggregates.length; ++i ) {
stBld.append(fieldDelimOut).append(aggregates[i]);
}
			outVal.set(stBld.toString());
			context.write(NullWritable.get(), outVal);
			Configuration conf = context.getConfiguration();
			saveCoefficients(conf, stBld.toString());
}
		/**
		 * Appends the new coefficient vector to the coefficient file in HDFS.
		 * @param conf job configuration
		 * @param newCoefficients delimited coefficient values for this iteration
		 * @throws IOException
		 */
private void saveCoefficients(Configuration conf, String newCoefficients) throws IOException {
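			//retain the full coefficient history, appending the new vector as the last line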
List<String> lines = Utility.getFileLines(conf, "coeff.file.path");
lines.add(newCoefficients);
//delete file
FileSystem fs = FileSystem.get(conf);
Path filenamePath = new Path(conf.get("coeff.file.path"));
fs.delete(filenamePath, true);
//recreate with new data
OutputStream os = fs.create( filenamePath);
BufferedWriter br = new BufferedWriter( new OutputStreamWriter( os, "UTF-8" ) );
for (String line : lines) {
br.write(line + "\n");
}
br.close();
fs.close();
}
/* (non-Javadoc)
* @see org.apache.hadoop.mapreduce.Reducer#reduce(KEYIN, java.lang.Iterable, org.apache.hadoop.mapreduce.Reducer.Context)
*/
protected void reduce(Text key, Iterable<Tuple> values, Context context)
throws IOException, InterruptedException {
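			//combine partial aggregate vectors from all the mappers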
for (Tuple value : values) {
if (null == regressor) {
regressor = new LogisticRegressor();
aggregate = new double[value.getSize()];
}
for (int i = 0; i < value.getSize(); ++i) {
aggregate[i] = value.getDouble(i);
}
regressor.addAggregates(aggregate);
}
}
}
	/**
	 * Runs the regression job repeatedly until convergence.
	 * @param args input path and output path
	 */
public static void main(String[] args) throws Exception {
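		//keep launching the map reduce pass until it reports convergence; a job failure also ends the loop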
int exitCode = NOT_CONVERGED;
int iterCount = 1;
do {
System.out.println("job iteration count:" + iterCount);
exitCode = ToolRunner.run(new LogisticRegressionJob(), args);
++iterCount;
} while (exitCode == NOT_CONVERGED);
System.exit(exitCode);
}
}