/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package edu.indiana.soic.ts.crunch;

import edu.indiana.soic.ts.utils.CleanMetric;
import edu.indiana.soic.ts.utils.Constants;
import edu.indiana.soic.ts.utils.TableUtils;
import edu.indiana.soic.ts.utils.VectorPoint;
import org.apache.crunch.*;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.hbase.HBaseSourceTarget;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.text.ParseException;
import java.util.*;
import java.util.concurrent.TimeUnit;
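
/**
 * A Crunch pipeline that reads daily stock prices from an HBase table and, for each
 * sliding date window, writes one serialized vector per security to HDFS. Row keys
 * are expected in the form "id_symbol" and cell values in the form "price_marketCap"
 * (inferred from the parsing logic below).
 */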
public class CrunchVectorCalculater extends Configured implements Tool, Serializable {
private static final Logger log = LoggerFactory.getLogger(CrunchVectorCalculater.class);
private static String startDate;
private static String endDate;
private static int mode;
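
    /*
     * Example invocation (the jar name is a placeholder):
     *
     *   hadoop jar ts-crunch.jar edu.indiana.soic.ts.crunch.CrunchVectorCalculater \
     *       <arg0> 20040102 20141231 5
     *
     * where <arg0> is not read by this tool (see run()).
     */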
public static void main(final String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapreduce.output.textoutputformat.separator", ",");
final int res = ToolRunner.run(conf, new CrunchVectorCalculater(), args);
System.exit(res);
}
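
    /**
     * Expected arguments, by the indices read below: args[1] = start date (yyyyMMdd),
     * args[2] = end date (yyyyMMdd), args[3] = mode (0 defaults to 5). args[0] is
     * never read here and is presumably consumed by the caller.
     */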
@Override
public int run(final String[] args) throws Exception {
try {
startDate = args[1];
endDate = args[2];
            mode = Integer.parseInt(args[3]);
System.out.println("Start Date : " + startDate);
System.out.println("End Date : " + endDate);
if (startDate == null || startDate.isEmpty()) {
// set 1st starting date
startDate = "20040102";
}
if (endDate == null || endDate.isEmpty()) {
endDate = "20141231";
}
if (mode == 0){
mode = 5;
}
Configuration hbaseConfig = HBaseConfiguration.create();
            // Generate the date windows to process; the arguments appear to define
            // 365-day windows advanced in 7-day steps (see TableUtils.genDates).
            TreeMap<String, List<Date>> genDates = TableUtils.genDates(
                    TableUtils.getDate(startDate), TableUtils.getDate(endDate),
                    365, TimeUnit.DAYS, 7, 7, TimeUnit.DAYS);
PipelineResult result = null;
hbaseConfig.set("mapreduce.output.textoutputformat.separator", ",");
Pipeline pipeline = new MRPipeline(CrunchVectorCalculater.class, hbaseConfig);
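            // Stage one scan/read/write per date window; Crunch is lazy, so nothing
            // executes until the single pipeline.done() call after the loop.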
            for (Map.Entry<String, List<Date>> window : genDates.entrySet()) {
                String id = window.getKey();
                Scan scan = new Scan();
                scan.setCaching(500);        // Scan's default of 1 is far too small for MapReduce jobs
                scan.setCacheBlocks(false);  // block caching should stay off for MR scans
                List<Date> dates = window.getValue();
String start = TableUtils.convertDateToString(dates.get(0));
String end = TableUtils.convertDateToString(dates.get(1));
List<String> suitableDateList = TableUtils.getDates(start, end);
                // NOTE: every window shares the single pipeline run triggered by
                // pipeline.done() below, so this per-window value is overwritten on
                // each iteration; the DoFn is likely to see only the last window's value.
                hbaseConfig.set(Constants.Job.NO_OF_DAYS, String.valueOf(suitableDateList.size()));
                getConf().addResource(hbaseConfig);
for (String date : suitableDateList){
                    scan.addColumn(Constants.STOCK_TABLE_CF_BYTES, Bytes.toBytes(date));
}
                // The HBase table and scan, wrapped as a source Crunch can read from
                HBaseSourceTarget source = new HBaseSourceTarget(Constants.STOCK_TABLE_NAME, scan);
                PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);
                PTable<String, String> vectors = extractText(rawText);
                pipeline.writeTextFile(vectors, Constants.HDFS_OUTPUT_PATH + id);
}
result = pipeline.done();
return result.succeeded() ? 0 : 1;
} catch (ParseException e) {
log.error("Error while parsing date", e);
throw new RuntimeException("Error while parsing date", e);
}
}
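
    /**
     * Turns raw HBase rows into (id, serialized vector) pairs: one VectorPoint per
     * row, filled with the per-date prices stored in the row's columns.
     */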
public PTable<String, String> extractText(final PTable<ImmutableBytesWritable, Result> tableContent) {
return tableContent.parallelDo("Read data", new DoFn<Pair<ImmutableBytesWritable, Result>, Pair<String, String>>() {
int noOfDays;
@Override
public void configure(Configuration conf) {
super.configure(conf);
                noOfDays = Integer.parseInt(conf.get(Constants.Job.NO_OF_DAYS));
}
@Override
public void process(Pair<ImmutableBytesWritable, Result> row, Emitter<Pair<String, String>> emitter) {
NavigableMap<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> map = row.second().getMap();
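                // Result.getMap() is keyed family -> qualifier -> timestamp -> cell value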
                // Iterate over each column family in the row
                for (Map.Entry<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> columnFamilyMap : map.entrySet()) {
                    double totalCap = 0;
                    // Row keys have the form "id_symbol"
                    String rowKey = Bytes.toString(row.second().getRow());
                    String[] idKey = rowKey.split("_");
                    int id = Integer.parseInt(idKey[0]);
                    String symbol = idKey[1];
                    int index = 0;
                    VectorPoint vectorPoint = new VectorPoint(id, symbol, noOfDays, true);
                    // Iterate over each column (one per date) and its versions
                    for (Map.Entry<byte[], NavigableMap<Long, byte[]>> entryVersion : columnFamilyMap.getValue().entrySet()) {
for (Map.Entry<Long, byte[]> entry : entryVersion.getValue().entrySet()) {
                            String column = Bytes.toString(entryVersion.getKey());
                            String valOfColumn = Bytes.toString(entry.getValue());
                            log.debug("RowKey: {} Column Key: {} Column Val: {}", rowKey, column, valOfColumn);
                            if (!valOfColumn.isEmpty()) {
                                // Cell values have the form "price_marketCap"
                                String[] priceAndCap = valOfColumn.split("_");
                                if (priceAndCap.length > 1) {
String pr = priceAndCap[0];
String cap = priceAndCap[1];
                                    if (pr != null && !pr.equals("null")) {
                                        double price = Double.parseDouble(pr);
                                        vectorPoint.add(price, index);
                                        index++;
                                    }
                                    if (cap != null && !cap.equals("null")) {
                                        totalCap += Double.parseDouble(cap);
                                    }
}
}
}
}
vectorPoint.setTotalCap(totalCap);
                    // Emit the vector only if it survives cleaning
                    String serialized = null;
                    if (vectorPoint.cleanVector(new CleanMetric())) {
                        serialized = vectorPoint.serialize();
                        log.debug(serialized);
                    }
                    if (serialized != null) {
                        emitter.emit(Pair.of(String.valueOf(id), serialized));
                    }
}
}
}, Writables.tableOf(Writables.strings(), Writables.strings()));
}
}