/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package edu.indiana.soic.ts.crunch.utils;
import edu.indiana.soic.ts.utils.Constants;
import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.io.hbase.HBaseTypes;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
public class CrunchUtils {
public static PCollection<String> splitLines(PCollection<String> lines) {
return lines.parallelDo(new DoFn<String, String>() {
@Override
public void process(String line, Emitter<String> emitter) {
for (String word : line.split(",")) {
emitter.emit(word);
}
}
}, Writables.strings());
}
/**
* This method return collection of <Put> objects which can be inserted to HBase
* @param lines lines from the CSV file
* @return collection of <Put>
*/
public static PCollection<Put> returnRows(PCollection<String> lines) {
// This will work fine because the DoFn is defined inside of a static method.
return lines.parallelDo(new DoFn<String, Put>() {
@Override
public void process(String line, Emitter<Put> emitter) {
String id = null, symbol = null, date = null, cap = null, price = null, rowKey, rowVal;
String[] fields = line.split(",");
if (fields.length > 0 && fields[0] != null && !fields[0].equals("")) {
id = fields[0];
}
if (fields.length > 1 && fields[1] != null && !fields[1].equals("")) {
date = fields[1];
}
if (fields.length > 2 && fields[2] != null && !fields[2].equals("")) {
symbol = fields[2];
}
if (fields.length > 3 && fields[3] != null && !fields[3].equals("")) {
price = fields[3];
}
if (fields.length > 4 && fields[4] != null && !fields[4].equals("")) {
cap = fields[4];
}
if (id != null && symbol != null) {
rowKey = id + "_" + symbol;
rowVal = date + "_" + price + "_" + cap;
Put row = new Put(rowKey.getBytes());
String[] split = rowVal.split("_");
if (split.length > 2){
row.add(Constants.STOCK_TABLE_CF_BYTES, Bytes.toBytes(split[0]), Bytes.toBytes(split[1] + "_" + split[2]));
}else if (split.length > 1 && split.length < 2){
row.add(Constants.STOCK_TABLE_CF_BYTES, Bytes.toBytes(split[0]), Bytes.toBytes(split[1] + "_NAN" ));
}else if (split.length > 0 && split.length <1){
row.add(Constants.STOCK_TABLE_CF_BYTES, Bytes.toBytes(split[0]), Bytes.toBytes("NAN_NAN" ));
}
emitter.emit(row);
}
}
}, HBaseTypes.puts());
}
/**
* This method return collection of <Put> objects which can be inserted to HBase
* @param lines lines from the CSV file
* @return collection of <Put>
*/
public static PCollection<Put> returnDates(PCollection<String> lines) {
// This will work fine because the DoFn is defined inside of a static method.
return lines.parallelDo(new DoFn<String, Put>() {
@Override
public void process(String line, Emitter<Put> emitter) {
String date = null, rowKey = null, rowVal = null;
String[] fields = line.split(",");
if (fields.length > 1 && fields[1] != null && !fields[1].equals("")) {
date = fields[1];
}
if (date != null){
rowKey = date;
rowVal = date;
Put row = new Put(rowKey.getBytes());
row.add(Constants.STOCK_DATES_CF_BYTES, Bytes.toBytes(rowVal), Bytes.toBytes(rowVal));
emitter.emit(row);
}
}
}, HBaseTypes.puts());
}
}