/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
 */
package edu.indiana.soic.ts.crunch;

import com.google.protobuf.ServiceException;
import edu.indiana.soic.ts.utils.Constants;
import org.apache.commons.math.stat.regression.SimpleRegression;
import org.apache.crunch.*;
import org.apache.crunch.fn.Aggregators;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.hbase.HBaseSourceTarget;
import org.apache.crunch.io.hbase.HBaseTarget;
import org.apache.crunch.io.hbase.HBaseTypes;
import org.apache.crunch.types.writable.Writables;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.Serializable;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;

/**
 * Crunch MapReduce pipeline that reads stock prices between two dates from an
 * HBase table, fits a {@link SimpleRegression} over each row's price series,
 * and writes the resulting intercept/slope pairs to a second HBase table.
 */
public class CrunchDataReader extends Configured implements Tool, Serializable {
    private static final Logger log = LoggerFactory.getLogger(CrunchDataReader.class);
    private static String startDate;
    private static String endDate;

    public static void main(final String[] args) throws Exception {
        final int res = ToolRunner.run(new Configuration(), new CrunchDataReader(), args);
        System.exit(res);
    }

    @Override
    public int run(final String[] args) throws Exception {
        try {
            // The dates are expected at args[1] and args[2] (args[0] is not read here);
            // guard the lookup so that missing arguments fall through to the defaults below
            // instead of throwing ArrayIndexOutOfBoundsException.
            if (args.length > 2) {
                startDate = args[1];
                endDate = args[2];
            }
            System.out.println("Start Date : " + startDate);
            System.out.println("End Date : " + endDate);
            if (startDate == null || startDate.isEmpty()) {
                // default to the first date in the data set
                startDate = "20040102";
            }
            if (endDate == null || endDate.isEmpty()) {
                endDate = "20141231";
            }
            Configuration config = HBaseConfiguration.create();
            Pipeline pipeline = new MRPipeline(CrunchDataReader.class, config);
            Scan scan = new Scan();
            scan.setCaching(500);       // the Scan default of 1 is bad for MapReduce jobs
            scan.setCacheBlocks(false); // don't set to true for MR jobs
            List<String> suitableDates = getDates();
            if (suitableDates != null && !suitableDates.isEmpty()) {
                for (String date : suitableDates) {
                    scan.addColumn(Constants.STOCK_TABLE_CF_BYTES, date.getBytes());
                }
            }
            createTable();
            // Our HBase source
            HBaseSourceTarget source = new HBaseSourceTarget(Constants.STOCK_TABLE_NAME, scan);
            // The source, in a format Crunch can use
            PTable<ImmutableBytesWritable, Result> rawText = pipeline.read(source);
            PTable<String, String> stringStringPTable = extractText(rawText);
            // Concatenate all values that share a key into one space-separated string
            PTable<String, String> result1 = stringStringPTable.groupByKey()
                    .combineValues(Aggregators.STRING_CONCAT(" ", true));
            // Create the collection of Puts from the concatenated data
            PCollection<Put> resultPut = createPut(result1);
            // Write the Puts to HBase, into the target table
            pipeline.write(resultPut, new HBaseTarget(Constants.REGRESSION_TABLE_NAME));
            PipelineResult result = pipeline.done();
            return result.succeeded() ? 0 : 1;
        } catch (ParseException e) {
            log.error("Error while parsing date", e);
            throw new RuntimeException("Error while parsing date", e);
        }
    }

    private static void createTable() throws Exception {
        try {
            Configuration configuration = HBaseConfiguration.create();
            HBaseAdmin.checkHBaseAvailable(configuration);
            Connection connection = ConnectionFactory.createConnection(configuration);
            // Instantiating the Admin class
            Admin admin = connection.getAdmin();
            // Instantiating the table descriptor
            HTableDescriptor stockTableDesc =
                    new HTableDescriptor(TableName.valueOf(Constants.REGRESSION_TABLE_NAME));
            // Adding the column family to the table descriptor
            HColumnDescriptor stock_0414 = new HColumnDescriptor(Constants.REGRESSION_TABLE_CF);
            stockTableDesc.addFamily(stock_0414);
            // Create the table through admin, unless it already exists
            if (!admin.tableExists(stockTableDesc.getTableName())) {
                admin.createTable(stockTableDesc);
                System.out.println("Regression table created");
            }
            // Load hbase-site.xml
            HBaseConfiguration.addHbaseResources(configuration);
            // Release the admin handle and connection
            admin.close();
            connection.close();
        } catch (ServiceException e) {
            log.error("Error occurred while creating HBase tables", e);
            throw new Exception("Error occurred while creating HBase tables", e);
        }
    }

    public PCollection<Put> createPut(PTable<String, String> extractedText) {
        return extractedText.parallelDo("Convert to puts", new DoFn<Pair<String, String>, Put>() {
            @Override
            public void process(Pair<String, String> input, Emitter<Put> emitter) {
                // The pair's first element becomes the row key; the second goes into
                // the regression column family under the configured qualifier
                Put put = new Put(Bytes.toBytes(input.first()));
                put.add(Constants.REGRESSION_TABLE_CF.getBytes(),
                        Constants.REGRESSION_TABLE_QUALIFIER.getBytes(),
                        Bytes.toBytes(input.second()));
                emitter.emit(put);
            }
        }, HBaseTypes.puts());
    }

    private void getRows(Scan scan, List<String> suitableDates) throws ServiceException, IOException {
        Configuration configuration = HBaseConfiguration.create();
        HBaseConfiguration.addHbaseResources(configuration);
        HBaseAdmin.checkHBaseAvailable(configuration);
        Connection connection = ConnectionFactory.createConnection(configuration);
        Admin admin = connection.getAdmin();
        HTableDescriptor[] tableDescriptor = admin.listTables();
        for (HTableDescriptor aTableDescriptor : tableDescriptor) {
            if (aTableDescriptor.getTableName().getNameAsString().equals(Constants.STOCK_TABLE_NAME)) {
                Table table = connection.getTable(aTableDescriptor.getTableName());
                ResultScanner scanner = table.getScanner(scan);
                printRows(scanner, suitableDates);
            }
        }
    }

    public static void printRows(ResultScanner resultScanner, List<String> allDates) {
        for (Result aResultScanner : resultScanner) {
            printRow(aResultScanner, allDates);
        }
    }

    public static void printRow(Result result, List<String> allDates) {
        try {
            // The row key identifies the entire row
            String rowName = Bytes.toString(result.getRow());
            for (String date : allDates) {
                byte[] value = result.getValue(Constants.STOCK_TABLE_CF_BYTES, date.getBytes());
                if (value != null) {
                    System.out.println("Row Name : " + rowName + " : values : " + new String(value));
                }
            }
        } catch (Exception e) {
            log.error("Error while printing row", e);
        }
    }

    public static List<String> getDates() throws ParseException {
        List<String> allDates = new ArrayList<String>();
        Date startDate = getDate(CrunchDataReader.startDate);
        Date endDate = getDate(CrunchDataReader.endDate);
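        // Scan the dates table and keep only the row keys whose dates fall strictly
        // between startDate and endDate: the compareTo product below is positive only
        // when both comparisons have the same sign, i.e. startDate < rowDate < endDate.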
        ResultScanner scannerForDateTable = getScannerForDateTable();
        if (scannerForDateTable == null) {
            // HBase was unreachable or the dates table does not exist
            return allDates;
        }
        for (Result aResultScanner : scannerForDateTable) {
            String date = new String(aResultScanner.getRow());
            Date rowDate = getDate(date);
            if (startDate.compareTo(rowDate) * rowDate.compareTo(endDate) > 0) {
                allDates.add(date);
            }
        }
        return allDates;
    }

    public static Date getDate(String date) throws ParseException {
        DateFormat df = new SimpleDateFormat("yyyyMMdd");
        return df.parse(date);
    }

    private static ResultScanner getScannerForDateTable() {
        try {
            Configuration configuration = HBaseConfiguration.create();
            HBaseConfiguration.addHbaseResources(configuration);
            HBaseAdmin.checkHBaseAvailable(configuration);
            Connection connection = ConnectionFactory.createConnection(configuration);
            Admin admin = connection.getAdmin();
            HTableDescriptor[] tableDescriptor = admin.listTables();
            // Look for the dates table among all tables and open a scanner on it
            for (HTableDescriptor aTableDescriptor : tableDescriptor) {
                if (aTableDescriptor.getTableName().getNameAsString().equals(Constants.STOCK_DATES_TABLE)) {
                    Table table = connection.getTable(aTableDescriptor.getTableName());
                    Scan scan = new Scan();
                    scan.setCaching(20);
                    scan.addFamily(Constants.STOCK_DATES_CF_BYTES);
                    return table.getScanner(scan);
                }
            }
        } catch (ServiceException | IOException e) {
            // MasterNotRunningException and ZooKeeperConnectionException are
            // subclasses of IOException, so this multi-catch covers all four cases
            log.error("Error while reading Stock Dates table", e);
        }
        return null;
    }

    public PTable<String, String> extractText(PTable<ImmutableBytesWritable, Result> tableContent) {
        return tableContent.parallelDo("Read data",
            new DoFn<Pair<ImmutableBytesWritable, Result>, Pair<String, String>>() {
                @Override
                public void process(Pair<ImmutableBytesWritable, Result> row,
                                    Emitter<Pair<String, String>> emitter) {
                    SimpleRegression regression;
                    NavigableMap<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> map =
                            row.second().getMap();
                    System.out.println(map.size());
                    for (Map.Entry<byte[], NavigableMap<byte[], NavigableMap<Long, byte[]>>> columnFamilyMap
                            : map.entrySet()) {
                        // Fit one regression per column family: x is the column index,
                        // y is the price stored in that column
                        regression = new SimpleRegression();
                        int count = 1;
                        for (Map.Entry<byte[], NavigableMap<Long, byte[]>> entryVersion
                                : columnFamilyMap.getValue().entrySet()) {
                            for (Map.Entry<Long, byte[]> entry : entryVersion.getValue().entrySet()) {
                                String rowKey = Bytes.toString(row.second().getRow());
                                String column = Bytes.toString(entryVersion.getKey());
                                byte[] val = entry.getValue();
                                String valOfColumn = new String(val);
                                System.out.println("RowKey : " + rowKey + " Column Key : " + column
                                        + " Column Val : " + valOfColumn);
                                if (!valOfColumn.isEmpty()) {
                                    // Cell values are stored as "<price>_<cap>"
                                    String[] priceAndCap = valOfColumn.split("_");
                                    if (priceAndCap.length > 1) {
                                        String pr = priceAndCap[0];
                                        if (pr != null && !pr.equals("null")) {
                                            double price = Double.valueOf(pr);
                                            if (price < 0) {
                                                // Flip negative prices to positive
                                                // (equivalent to taking the absolute value)
                                                price = -price;
                                            }
                                            System.out.println("Price : " + price + " count : " + count);
                                            regression.addData(count, price);
                                        }
                                    }
                                }
                            }
                            count++;
                        }
                        // Intercept of the fitted regression line
                        System.out.println("Intercept : " + regression.getIntercept());
                        // Slope of the fitted regression line
                        System.out.println("Slope : " + regression.getSlope());
                        // Standard error of the slope
                        System.out.println("Slope STD Error : " + regression.getSlopeStdErr());
                        emitter.emit(new Pair<String, String>(String.valueOf(regression.getIntercept()),
                                String.valueOf(regression.getSlope())));
                    }
                }
            },
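            // The output PTable is serialized as pairs of Writable strings:
            // key = intercept, value = slope, matching the Pair emitted above.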
            Writables.tableOf(Writables.strings(), Writables.strings()));
    }
}
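// Example invocation (the jar name is hypothetical; args[0] is not read by run(),
// so a placeholder argument precedes the start and end dates):
//
//   hadoop jar ts-crunch.jar edu.indiana.soic.ts.crunch.CrunchDataReader conf 20040102 20141231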