import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter;
import org.apache.hadoop.hbase.filter.RegexStringComparator;
import org.apache.hadoop.hbase.filter.RowFilter;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.godhuli.rhipe.RHRaw;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
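// RHHBaseRecorder and DataType, used below, come from the r2time / RHIPE HBase
// integration; they are assumed to be available on the classpath in the same
// (default) package as this class, since no explicit imports are present.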
public class LR {
static class LinearRegressionMapper extends TableMapper<IntWritable, FloatWritable> {
private Path[] localFiles;
FileInputStream fis = null;
BufferedInputStream bis = null;
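// All map outputs share the constant key below so a single reduce call sums the
// cost over every record; TS buffers the 4-byte base timestamp taken from the
// current row key.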
private static final IntWritable KEY = new IntWritable(1);
private static byte [] TS = new byte[4];
private final FloatWritable VALUE = new FloatWritable(1);
/**
* Load the distributed-cache file list once per task. With the new mapreduce
* API used by TableMapper, setup(Context) is the hook the framework actually
* calls; an old-style configure(Configuration) method would never be invoked.
*/
@Override
protected void setup(Context context)
{
try {
localFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
} catch (IOException e) {
e.printStackTrace();
}
}
public void map(ImmutableBytesWritable row, Result values, Context context) throws IOException {
/**
* Linear-regression cost function.
*
* For every record in the assigned subset, compute the predicted value
* y_predict(x) from the feature values and the current theta values, subtract
* the true y value, and square the difference. The reducer then sums these
* contributions:
*
* J(theta) = sum((y_predict(x) - y)^2)
* y_predict(x) = theta(0)*x(0) + ... + theta(i)*x(i)
*/
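/*
* Worked example with arbitrary illustrative numbers: for theta = (0.5, 2.0),
* features x = (1, 3) and observed y = 7, the prediction is
* y_predict = 0.5*1 + 2.0*3 = 6.5, so this record contributes
* (6.5 - 7)^2 = 0.25 to J(theta).
*/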
System.arraycopy(row.get(), 3, TS, 0, 4);
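// TS now holds the 4-byte base timestamp copied from offset 3 of the row key;
// with OpenTSDB's default 3-byte UID width the key layout is
// [metric UID (3)][base timestamp (4)][tag key/value UID pairs].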
for (KeyValue kv : values.raw()) {
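// OpenTSDB packs second-precision data points into 2-byte column qualifiers:
// the top 12 bits are the offset (in seconds) from the base timestamp and the
// low 4 bits are format flags, hence the mask and unsigned shift below.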
final short delta = (short) (( org.apache.hadoop.hbase.util.Bytes.toShort(kv.getQualifier()) & 0xFFFF) >>> 4);
int timestamp = org.apache.hadoop.hbase.util.Bytes.toInt(TS)+ delta;
//System.out.print("\n Base Timestamp as Rowkey => "+ org.apache.hadoop.hbase.util.Bytes.toInt(TS) + " -- Timestamp " + timestamp + "\n");
// System.out.print("Value " + org.apache.hadoop.hbase.util.Bytes.toFloat(kv.getValue() )+ "\n");
/**
* Read the cell value and convert it to a float. costs() treats the first
* (and here only) element of this list as the observed y value.
*/
List<Float> val = new ArrayList<Float>();
val.add(org.apache.hadoop.hbase.util.Bytes.toFloat(kv.getValue()));
//VALUE.set(org.apache.hadoop.hbase.util.Bytes.toFloat(kv.getValue()));
System.out.print("Value " + org.apache.hadoop.hbase.util.Bytes.toFloat(kv.getValue() )+ "\n");
/**
* Compute the squared error for this data point and emit it under the
* constant key so the reducer can sum over all records.
*/
//context.write(KEY, new FloatWritable(costs(val)));
//KEY.set(timestamp);
try {
//context.write(KEY,VALUE); //Define key as 1 same for every values
context.write(KEY, new FloatWritable(costs(val)));
}
catch (InterruptedException e) {
throw new IOException(e);
}
//numRecords++;
}
}
private float costs(List<Float> values)
{
/**
* Load the theta values. If the distributed cache was populated in setup(),
* read the local copy from there; otherwise fall back to the original
* hard-coded path. Note that the file is re-read on every call; caching the
* theta values in setup() would avoid the repeated I/O.
*/
File file;
if (localFiles != null && localFiles.length > 0) {
file = new File(localFiles[0].toString());
} else {
file = new File("/home/bikash/repos/r2time/examples/theta.csv");
}
float costs = 0;
BufferedReader d = null;
try {
FileInputStream fis = new FileInputStream(file);
BufferedInputStream bis = new BufferedInputStream(fis);
d = new BufferedReader(new InputStreamReader(bis));
// The theta values are stored as a single comma-separated line.
String line = d.readLine();
String[] theta = line.split(",");
// The first element of 'values' is the observed y value.
float y = values.get(0);
/**
* Compute y_predict(x) = theta(0)*x(0) + ... + theta(i)*x(i). With a single
* cell value the list holds only y, so only the bias term contributes here.
*/
for (int j = 0; j < values.size(); j++)
{
if (j == 0)
costs += Float.parseFloat(theta[j]); // bias term, x(0) = 1
else
costs += Float.parseFloat(theta[j]) * values.get(j);
}
// Subtract y and square to get the squared error for this record.
costs = (costs - y) * (costs - y);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (d != null) {
try { d.close(); } catch (IOException ignored) { }
}
}
return costs;
}
}
public static class LinearRegressionReducer extends TableReducer <IntWritable, FloatWritable, ImmutableBytesWritable>
{
public static final byte[] CF = "cf".getBytes();
public static final byte[] COUNT = "count".getBytes();
@Override
public void reduce(IntWritable key, Iterable<FloatWritable> value, Context context)
throws IOException, InterruptedException {
/**
* The reducer sums the squared errors received for the (constant) key,
* producing the total cost J(theta).
*/
float sum = 0;
int count =0;
for (FloatWritable val : value) {
sum += val.get();
count++;
}
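// Write the total cost to the output table; the "out" table must contain the
// column family used below ("number"). Note that 'count' is accumulated above
// but not written.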
Put put = new Put(org.apache.hadoop.hbase.util.Bytes.toBytes(key.get()));
System.out.print("Sum value => " + sum+ "\n");
put.add(org.apache.hadoop.hbase.util.Bytes.toBytes("number"), org.apache.hadoop.hbase.util.Bytes.toBytes(""), org.apache.hadoop.hbase.util.Bytes.toBytes(sum));
context.write(null, put);
//output.collect(key, new FloatWritable(sum));
}
}
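/*
* Example launch (the jar name is illustrative and depends on how this class
* is packaged):
*
*   hadoop jar r2time-examples.jar LR
*
* theta.csv must be available at the path passed to DistributedCache in main(),
* or at the hard-coded fallback path on every task node.
*/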
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
Configuration conf = HBaseConfiguration.create();
String zookeeperQuorum = "haisen24.ux.uis.no";
String HBaseMaster = "haisen23.ux.uis.no:60000";
// String zookeeperQuorum = "localhost";
// String HBaseMaster = "localhost:60000";
conf.set("hbase.zookeeper.quorum", zookeeperQuorum);
conf.set("hbase.zookeeper.property.clientPort", "2181");
// conf.set("mapred.job.tracker", "haisen22.ux.uis.no:8021");
// conf.set("fs.default.name", "hdfs://haisen20.ux.uis.no:8020");
conf.set("hbase.master", HBaseMaster);
Job job = new Job(conf, "Linear Regression");
job.setJarByClass(LR.class);
/**
* Try to load the theta values into the distributed cache
*/
try {
// The URI must point to theta.csv in HDFS; the framework copies it to each task node.
DistributedCache.addCacheFile(new URI("/home/bikash/repos/r2time/examples/theta.csv"), conf);
} catch (URISyntaxException e1) {
e1.printStackTrace();
}
/*** Create the Scan object ***/
DataType dt = new DataType();
dt.setHbaseClient(zookeeperQuorum);
String[] tagk = {"1","host"};
String[] tagv = {"1","*"};
String[] val = DataType.getRowkeyFilter("1980/01/01-00:00:00","2014/02/22-10:00:00", "r2time.stress.test", tagk, tagv);
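// val[0] and val[1] are Base64-encoded start/stop row keys for the requested
// time range; val[2] is a Base64-encoded row-key regex matching the metric and
// tag filters above.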
Scan scans = new Scan();
scans.setStartRow(org.apache.commons.codec.binary.Base64.decodeBase64(val[0]));
scans.setStopRow(org.apache.commons.codec.binary.Base64.decodeBase64(val[1]));
RowFilter rowFilterRegex = new RowFilter(CompareFilter.CompareOp.EQUAL,
new RegexStringComparator( org.apache.hadoop.hbase.util.Bytes.toString(org.apache.commons.codec.binary.Base64.decodeBase64(val[2]))));
scans.setFilter(rowFilterRegex);
scans.setCaching(500); // 1 is the default in Scan, which will be bad for MapReduce jobs
scans.setCacheBlocks(false);
//job.setOutputKeyClass(RHRaw.class);
//job.setOutputValueClass(RHResult.class);
String columns = "t:"; // comma separated
scans.addColumns(columns);
//job.setMapperClass(Mapper1.class);
TableMapReduceUtil.initTableMapperJob("tsdb", scans, LinearRegressionMapper.class, IntWritable.class, FloatWritable.class, job);
// Set the r2time input format after initTableMapperJob, which would otherwise
// reset the job's input format to the default TableInputFormat.
job.setInputFormatClass(RHHBaseRecorder.class);
job.setReducerClass(LinearRegressionReducer.class);
//job.setNumReduceTasks(1); // at least one, adjust as required
TableMapReduceUtil.initTableReducerJob("out", LinearRegressionReducer.class, job);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}