/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/
package edu.indiana.soic.ts.streaming.dataflow;
import com.google.cloud.dataflow.sdk.Pipeline;
import com.google.cloud.dataflow.sdk.io.TextIO;
import com.google.cloud.dataflow.sdk.options.Default;
import com.google.cloud.dataflow.sdk.options.Description;
import com.google.cloud.dataflow.sdk.options.PipelineOptions;
import com.google.cloud.dataflow.sdk.options.PipelineOptionsFactory;
import com.google.cloud.dataflow.sdk.transforms.Combine;
import com.google.cloud.dataflow.sdk.transforms.DoFn;
import com.google.cloud.dataflow.sdk.transforms.GroupByKey;
import com.google.cloud.dataflow.sdk.transforms.ParDo;
import com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows;
import com.google.cloud.dataflow.sdk.transforms.windowing.Window;
import com.google.cloud.dataflow.sdk.values.KV;
import com.google.cloud.dataflow.sdk.values.PCollection;
import com.google.cloud.dataflow.sdk.values.PCollectionView;
import edu.indiana.soic.ts.streaming.dataflow.utils.DistanceMatrix;
import edu.indiana.soic.ts.streaming.dataflow.utils.StockPricePoint;
import edu.indiana.soic.ts.streaming.dataflow.utils.SymbolEncoder;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.*;
/**
 * Dataflow pipeline that reads comma-separated stock price records from a text file,
 * assigns each record an event timestamp taken from its date column, windows the records,
 * pairs up the price series of every two companies that appear in the same window and
 * writes one textual distance-matrix entry per window to the output file.
 *
 * <p>The distance computation itself is still a placeholder: every pair is emitted with
 * a distance of {@code 0.0}.
 */
public class StockAnalysisPipeline1 {
    private final static Logger logger = LoggerFactory.getLogger(StockAnalysisPipeline1.class);

    /** Window length in days (passed to {@code Duration.standardDays}). */
    public static int WINDOW_LENGTH = 360;

    /** Interval between the starts of consecutive windows, in days. */
    public static int SLIDING_INTERVAL = 360;

    /** Command line options understood by this pipeline. */
    public static interface StockAnalysisPipelineOptions extends PipelineOptions {
        @Description("Path to input file")
        @Default.String("")
        String getInputFilePath();

        void setInputFilePath(String value);

        @Description("Output file path")
        @Default.String("")
        String getOutputFilePath();

        void setOutputFilePath(String value);
    }

    /**
     * Builds and runs the pipeline.
     *
     * @param args pipeline options in the form accepted by {@code PipelineOptionsFactory.fromArgs},
     *             including the input and output file paths
     * @throws IOException declared for callers; kept for interface compatibility
     */
    public static void main(String[] args) throws IOException {
        final SymbolEncoder symbolEncoder = new SymbolEncoder();
        StockAnalysisPipelineOptions options =
                PipelineOptionsFactory.fromArgs(args).as(StockAnalysisPipelineOptions.class);
        Pipeline pipeline = Pipeline.create(options);

        // Reading and time stamping the stock prices.
        // Expected columns: 0 = id, 1 = date (yyyyMMdd), 2 = symbol, 5 = price, 6 = cap.
        PCollection<KV<Integer, StockPricePoint>> stockPrices = pipeline
                .apply(TextIO.Read.named("Reading Input File").from(options.getInputFilePath()))
                .apply(ParDo.named("Timestamping").of(new DoFn<String, KV<Integer, StockPricePoint>>() {
                    @Override
                    public void processElement(ProcessContext c) throws Exception {
                        StockPricePoint stockPoint;
                        Integer symbolIndex;
                        Instant timestamp;
                        try {
                            String[] fields = c.element().split(",");
                            stockPoint = new StockPricePoint();
                            stockPoint.setId(fields[0]);
                            // SimpleDateFormat is not thread-safe, so a fresh instance is used per record.
                            SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
                            stockPoint.setDate(sdf.parse(fields[1].trim()));
                            stockPoint.setSymbol(fields[2]);
                            stockPoint.setPrice(Double.parseDouble(fields[5].trim()));
                            stockPoint.setCap(Double.parseDouble(fields[6].trim()));
                            timestamp = new Instant(stockPoint.getDate().getTime());
                            symbolIndex = symbolEncoder.getSymbolIndex(stockPoint.getSymbol());
                        } catch (Exception ex) {
                            // Input format issue: skip the record, but leave a trace instead of
                            // dropping it silently. Only the exception summary is logged to keep
                            // the log volume manageable for inputs with many bad lines.
                            logger.warn("Skipping malformed input record '{}': {}", c.element(), ex.toString());
                            return;
                        }
                        // Emit outside the try block so that a failure raised while emitting
                        // is not mistaken for an input format problem and swallowed.
                        // Note: the local runner cannot handle large amounts of data; filter on
                        // symbolIndex here when debugging locally.
                        c.outputWithTimestamp(KV.of(symbolIndex, stockPoint), timestamp);
                    }
                }));

        // Creating the sliding windows.
        PCollection<KV<Integer, StockPricePoint>> slidingWindowStockPrices = stockPrices.apply(
                Window.named("Windowing").<KV<Integer, StockPricePoint>>into(
                        SlidingWindows.of(Duration.standardDays(WINDOW_LENGTH))
                                .every(Duration.standardDays(SLIDING_INTERVAL))
                )
        );

        // Combining stock prices per company per window.
        PCollection<KV<Integer, List<StockPricePoint>>> stockPricesPerCompanyPerWindow = slidingWindowStockPrices
                .apply(GroupByKey.create())
                .apply(ParDo.named("Combining By Company").of(
                        new DoFn<KV<Integer, Iterable<StockPricePoint>>, KV<Integer, List<StockPricePoint>>>() {
                            @Override
                            public void processElement(ProcessContext c) throws Exception {
                                // Materialize the grouped values into a list.
                                List<StockPricePoint> stockPricePoints = new ArrayList<>();
                                for (StockPricePoint point : c.element().getValue()) {
                                    stockPricePoints.add(point);
                                }
                                c.output(KV.of(c.element().getKey(), stockPricePoints));
                            }
                        }));

        // Accumulating the set of company indices per window; exposed as a side input.
        final PCollectionView<Set<Integer>> companiesPerWindow = slidingWindowStockPrices.apply(Combine.globally(
                new Combine.CombineFn<KV<Integer, StockPricePoint>, Set<Integer>, Set<Integer>>() {
                    @Override
                    public Set<Integer> createAccumulator() {
                        return new HashSet<>();
                    }

                    @Override
                    public Set<Integer> addInput(Set<Integer> indices,
                                                 KV<Integer, StockPricePoint> integerStockPricePointKV) {
                        indices.add(integerStockPricePointKV.getKey());
                        return indices;
                    }

                    @Override
                    public Set<Integer> mergeAccumulators(Iterable<Set<Integer>> iterable) {
                        Set<Integer> indices = new HashSet<>();
                        for (Set<Integer> partial : iterable) {
                            indices.addAll(partial);
                        }
                        return indices;
                    }

                    @Override
                    public Set<Integer> extractOutput(Set<Integer> indices) {
                        return indices;
                    }
                }).named("Combine By Window").asSingletonView());

        // Duplicate the company entries in each window to create distance matrix entries:
        // every company's series is emitted once per other company, keyed by "<larger>_<smaller>".
        PCollection<KV<String, List<StockPricePoint>>> explodedEntries = stockPricesPerCompanyPerWindow.apply(
                ParDo.named("Duplicating Entries").withSideInputs(companiesPerWindow).of(
                        new DoFn<KV<Integer, List<StockPricePoint>>, KV<String, List<StockPricePoint>>>() {
                            @Override
                            public void processElement(ProcessContext c) throws Exception {
                                Set<Integer> indices = c.sideInput(companiesPerWindow);
                                Integer key = c.element().getKey();
                                List<StockPricePoint> stockPricePoints = c.element().getValue();
                                for (Integer other : indices) {
                                    // We generate only the lower half; the distance matrix is symmetric.
                                    // The diagonal (other == key) is intentionally not emitted.
                                    if (key > other) {
                                        c.output(KV.of(key + "_" + other, stockPricePoints));
                                    } else if (other > key) {
                                        c.output(KV.of(other + "_" + key, stockPricePoints));
                                    }
                                }
                            }
                        }));

        // Grouping two entries to create a distance entry in the matrix and calculating the distance.
        PCollection<KV<String, Double>> distances = explodedEntries.apply(GroupByKey.create()).apply(
                ParDo.named("Calculate Distances").of(
                        new DoFn<KV<String, Iterable<List<StockPricePoint>>>, KV<String, Double>>() {
                            @Override
                            public void processElement(ProcessContext processContext) throws Exception {
                                String pairKey = processContext.element().getKey();
                                Iterator<List<StockPricePoint>> series =
                                        processContext.element().getValue().iterator();
                                // A pair is expected to carry exactly two series; guard against an
                                // incomplete pair instead of failing with NoSuchElementException.
                                if (!series.hasNext()) {
                                    logger.warn("No price series found for pair {}; skipping", pairKey);
                                    return;
                                }
                                List<StockPricePoint> stockPricesX = series.next();
                                if (!series.hasNext()) {
                                    logger.warn("Only one price series found for pair {}; skipping", pairKey);
                                    return;
                                }
                                List<StockPricePoint> stockPricesY = series.next();
                                // NOTE(review): the order of the two series within the group is
                                // presumably not tied to the order of the indices in the key - confirm
                                // before relying on it in the distance computation.
                                //TODO calculate distance
                                processContext.output(KV.of(pairKey, 0.0));
                            }
                        }));

        // Formulate the distance matrix.
        PCollection<DistanceMatrix> distanceMatrix = distances.apply(Combine.globally(
                new Combine.CombineFn<KV<String, Double>, DistanceMatrix, DistanceMatrix>() {
                    @Override
                    public DistanceMatrix createAccumulator() {
                        return new DistanceMatrix();
                    }

                    @Override
                    public DistanceMatrix addInput(DistanceMatrix distanceMatrix,
                                                   KV<String, Double> stringDoubleKV) {
                        // Keys have the form "<row>_<column>"; split once and reuse both parts.
                        String[] indices = stringDoubleKV.getKey().split("_");
                        distanceMatrix.addPoint(Integer.parseInt(indices[0]),
                                Integer.parseInt(indices[1]), stringDoubleKV.getValue());
                        return distanceMatrix;
                    }

                    @Override
                    public DistanceMatrix mergeAccumulators(Iterable<DistanceMatrix> iterable) {
                        DistanceMatrix merged = new DistanceMatrix();
                        for (DistanceMatrix partial : iterable) {
                            merged.merge(partial);
                        }
                        return merged;
                    }

                    @Override
                    public DistanceMatrix extractOutput(DistanceMatrix distanceMatrix) {
                        return distanceMatrix;
                    }
                }).named("Combine Distance Matrix").withoutDefaults());

        // Write to file: one <distance-matrix-entry> element per combined matrix.
        distanceMatrix.apply(ParDo.named("Matrix to String").of(new DoFn<DistanceMatrix, String>() {
            @Override
            public void processElement(ProcessContext processContext) throws Exception {
                DistanceMatrix matrix = processContext.element();
                StringBuilder entry = new StringBuilder();
                entry.append("<distance-matrix-entry>\n").append(processContext.timestamp()).append("\n");
                entry.append(matrix.getDistanceValues().toString()).append("\n");
                entry.append(matrix.getRow().toString()).append("\n");
                entry.append(matrix.getColumn().toString()).append("\n");
                entry.append("</distance-matrix-entry>\n");
                processContext.output(entry.toString());
            }
        })).apply(TextIO.Write.named("Writing Output File").to(options.getOutputFilePath()));

        pipeline.run();
        // Terminate the JVM explicitly once run() has returned.
        System.exit(0);
    }
}