import mpi.MPI;
import mpi.MPIException;
import mpi.MpiOps;
import org.apache.commons.cli.*;
import java.io.*;
import java.util.*;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
public class PVectorGenerator {
/** Path of the directory whose entries are processed as input files. */
private final String inFolder;
/** Path of the directory that receives one output file per input file. */
private final String outFolder;
/** Points accumulated for the file currently being processed, keyed by record symbol. */
private final Map<Integer, VectorPoint> currentPoints = new HashMap<Integer, VectorPoint>();
/** Value handed to every newly created VectorPoint; comes from the constructor. */
private final int days;
/** Whether the input files are split across MPI ranks; set once by the constructor. */
private final boolean mpi;
/** MPI helper; only created by process() when mpi is set. */
private MpiOps mpiOps;
/** Metrics collected per output file, keyed by the output file path. */
private final Map<String, CleanMetric> metrics = new HashMap<String, CleanMetric>();
/** Granularity used by check() when comparing two dates. */
private enum DateCheckType {
    /** Two dates match when both their year and their month are equal. */
    MONTH,
    /** Two dates match when their year is equal. */
    YEAR,
    /** Not handled by check(); a comparison with this type never matches. */
    CONT_YEAR
}
/**
 * Creates a generator; no I/O happens until {@link #process()} is called.
 *
 * @param inFile  path of the input folder (process() refuses anything that is not a directory)
 * @param outFile path of the output folder
 * @param days    value passed to every VectorPoint created while reading records
 * @param mpi     true to split the input files across MPI ranks
 */
public PVectorGenerator(String inFile, String outFile, int days, boolean mpi) {
    this.inFolder = inFile;
    this.outFolder = outFile;
    this.mpi = mpi;
    this.days = days;
}
/**
 * Processes every entry of the input folder and writes one output file per entry.
 *
 * <p>With MPI enabled the entries are dealt out round-robin by rank, so each rank
 * handles only its own share; otherwise every entry is handled by this process.
 * The work runs on a single worker thread that this method waits for.
 *
 * @throws RuntimeException if the MPI rank or size cannot be obtained
 */
public void process() {
    System.out.println("starting vector generator...");
    File inDir = new File(this.inFolder);
    if (!inDir.isDirectory()) {
        System.out.println("In should be a folder");
        return;
    }
    // create the out directory
    Utils.createDirectory(outFolder);
    // List the folder exactly once: listFiles() hits the file system on every call,
    // guarantees no ordering, and returns null when the listing fails.
    File[] entries = inDir.listFiles();
    if (entries == null) {
        System.out.println("Failed to list the input folder: " + this.inFolder);
        return;
    }
    // A fixed order makes every MPI rank see the same sequence, so the round-robin
    // split below neither duplicates nor skips files across ranks.
    Arrays.sort(entries);
    try {
        BlockingQueue<File> files = new LinkedBlockingQueue<File>();
        if (mpi) {
            mpiOps = new MpiOps();
            int rank = mpiOps.getRank();
            int size = mpiOps.getSize();
            // slot cycles 0..size-1; this rank takes the entries that land on its own slot
            int slot = 0;
            for (File entry : entries) {
                if (slot == rank) {
                    files.put(entry);
                }
                slot++;
                if (slot == size) {
                    slot = 0;
                }
            }
        } else {
            Collections.addAll(files, entries);
        }
        // processFile() mutates shared, unsynchronized state (currentPoints, vectorCounter),
        // so exactly one worker is started.
        final int workerCount = 1;
        List<Thread> threads = new ArrayList<Thread>();
        for (int i = 0; i < workerCount; i++) {
            Thread t = new Thread(new Worker(files));
            t.start();
            threads.add(t);
        }
        boolean interrupted = false;
        for (Thread t : threads) {
            try {
                t.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
                interrupted = true;
            }
        }
        if (interrupted) {
            // restore the flag so callers can still observe the interruption
            Thread.currentThread().interrupt();
        }
        System.out.println("Vector generator finished...");
    } catch (MPIException e) {
        // keep the cause so the underlying MPI failure is visible in the stack trace
        throw new RuntimeException("Failed to communicate", e);
    } catch (InterruptedException e) {
        e.printStackTrace();
        Thread.currentThread().interrupt();
    }
}
/**
 * Drains a queue of input files, running processFile() on each one, and then
 * prints the metrics collected so far by the enclosing generator.
 */
private class Worker implements Runnable {
    /** Files still waiting to be processed. */
    private final BlockingQueue<File> queue;

    private Worker(BlockingQueue<File> queue) {
        this.queue = queue;
    }

    @Override
    public void run() {
        System.out.println("Vector generator files to proce: " + queue.size());
        // poll() removes and returns the head atomically and yields null once the queue
        // is empty; an isEmpty() check followed by a blocking take() could wait forever
        // if another consumer removed the last file in between.
        File f;
        while ((f = queue.poll()) != null) {
            processFile(f);
        }
        for (Map.Entry<String, CleanMetric> e : metrics.entrySet()) {
            System.out.println(e.getKey() + " : " + e.getValue().serialize());
        }
    }
}
/** Prints the serialized form of every point currently held in currentPoints, one per line. */
private void printExistingVectors() {
    for (VectorPoint point : currentPoints.values()) {
        System.out.println(point.serialize());
    }
}
/**
 * Prints all elements of the list on one line, each followed by {@code " ,"}.
 *
 * @param dates elements to print; a null element is printed as {@code null}
 */
private void printDates(List<?> dates) {
    StringBuilder sb = new StringBuilder();
    for (Object s : dates) {
        // append(Object) goes through String.valueOf, so a null element cannot throw
        sb.append(s).append(" ,");
    }
    System.out.println(sb.toString());
}
/**
 * Reads the records of one input file, accumulates them into one VectorPoint per
 * symbol and writes the resulting vectors to a file of the same name in the
 * output folder. A final vector carrying the summed cap of all written vectors
 * is appended at the end.
 *
 * @param inFile the input file to read
 * @throws RuntimeException if reading the input or writing the output fails
 */
private void processFile(File inFile) {
    BufferedWriter bufWriter = null;
    BufferedReader bufRead = null;
    int size = -1;
    vectorCounter = 0;
    String outFileName = outFolder + "/" + inFile.getName();
    CleanMetric metric = this.metrics.get(outFileName);
    if (metric == null) {
        metric = new CleanMetric();
        this.metrics.put(outFileName, metric);
    }
    try {
        // Each stream is stored in its tracked variable as soon as it is opened, so the
        // finally block still closes the reader when opening the output file fails.
        bufRead = new BufferedReader(new FileReader(inFile));
        bufWriter = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(outFileName))));
        Record record;
        int count = 0;
        int fullCount = 0;
        double totalCap = 0;
        int splitCount = 0;
        while ((record = Utils.parseFile(bufRead, null, false)) != null) {
            count++;
            int key = record.getSymbol();
            if (record.getFactorToAdjPrice() > 0) {
                splitCount++;
            }
            // check whether we have already seen a vector for this symbol
            VectorPoint point = currentPoints.get(key);
            if (point == null) {
                point = new VectorPoint(key, days);
                currentPoints.put(key, point);
            }
            if (!point.isFull()) {
                point.add(record.getPrice(), record.getFactorToAdjPrice(), record.getFactorToAdjVolume(), metric);
                point.addCap(record.getVolume() * record.getPrice());
            } else {
                System.out.println("Point full cannot add more....");
            }
            if (point.noOfElements() == size) {
                fullCount++;
            }
            // once more than 1000 symbols have been seen, take the most common number of
            // elements per symbol as the expected vector length for this file
            if (currentPoints.size() > 1000 && size == -1) {
                List<Integer> pointSizes = new ArrayList<Integer>();
                for (VectorPoint v : currentPoints.values()) {
                    pointSizes.add(v.noOfElements());
                }
                size = mostCommon(pointSizes);
                System.out.println("Number of stocks per period: " + size);
            }
            // now write the current vectors, also make sure we have the size determined correctly
            if (currentPoints.size() > 1000 && size != -1 && fullCount > 750) {
                System.out.println("Processed: " + count);
                totalCap += writeVectors(bufWriter, size, metric);
                fullCount = 0;
            }
        }
        System.out.println("Size: " + size);
        System.out.println("Split count: " + inFile.getName() + " = " + splitCount);
        // write the rest of the vectors in the map after finish reading the file
        totalCap += writeVectors(bufWriter, size, metric);
        // write the constant vector at the end
        VectorPoint v = new VectorPoint(0, new double[]{0});
        v.addCap(totalCap);
        bufWriter.write(v.serialize());
        // Flush here so a failed write surfaces as an error instead of being lost in
        // the best-effort close below.
        bufWriter.flush();
        System.out.println("Total stocks: " + vectorCounter + " bad stocks: " + currentPoints.size());
        metric.stocksWithIncorrectDays = currentPoints.size();
        System.out.println("Metrics for file: " + outFileName + " " + metric.serialize());
        currentPoints.clear();
    } catch (IOException e) {
        throw new RuntimeException("Failed to process the file " + inFile.getPath(), e);
    } finally {
        // closed independently so a failure on one stream cannot leak the other
        closeQuietly(bufWriter);
        closeQuietly(bufRead);
    }
}

/** Closes the resource if it was opened, ignoring a failure of the close itself. */
private static void closeQuietly(Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (IOException ignored) {
        // best effort: the output was already flushed and nothing can be recovered here
    }
}
/**
 * Returns the element that occurs most often in the list.
 *
 * <p>Elements are compared with {@code equals}/{@code hashCode}. When several
 * elements share the highest count, which of them is returned depends on the
 * iteration order of the internal hash map.
 *
 * @param list the elements to inspect; must contain at least one element
 * @param <T>  element type
 * @return the most frequent element
 * @throws IllegalArgumentException if the list is null or empty
 */
public static <T> T mostCommon(List<T> list) {
    if (list == null || list.isEmpty()) {
        // without this guard the lookup below fails with an unexplained NullPointerException
        throw new IllegalArgumentException("mostCommon requires a non-empty list");
    }
    Map<T, Integer> counts = new HashMap<T, Integer>();
    for (T t : list) {
        Integer seen = counts.get(t);
        counts.put(t, seen == null ? 1 : seen + 1);
    }
    Map.Entry<T, Integer> max = null;
    for (Map.Entry<T, Integer> e : counts.entrySet()) {
        if (max == null || e.getValue() > max.getValue()) {
            max = e;
        }
    }
    return max.getKey();
}
/** Number of vectors written for the current file; reset by processFile(), incremented by writeVectors(). */
int vectorCounter;
/**
 * Writes every accumulated point that has exactly {@code size} elements and
 * removes it from currentPoints. Points of any other length stay in the map
 * and are counted in {@code metric.lenghtWrong}.
 *
 * <p>A point of the right length that fails cleanVector(), or whose
 * serialization is null, is counted as invalid and dropped without being written.
 *
 * @param bufWriter destination; each written vector is followed by a line break
 * @param size      number of elements a point must have to be written
 * @param metric    counters updated while writing
 * @return the summed total cap of the vectors written by this call
 * @throws IOException if writing to {@code bufWriter} fails
 */
private double writeVectors(BufferedWriter bufWriter, int size, CleanMetric metric) throws IOException {
    double writtenCap = 0;
    Iterator<Map.Entry<Integer, VectorPoint>> points = currentPoints.entrySet().iterator();
    while (points.hasNext()) {
        VectorPoint point = points.next().getValue();
        if (point.noOfElements() != size) {
            metric.lenghtWrong++;
            continue;
        }
        metric.totalStocks++;
        // serialize() is only attempted for points that pass cleaning;
        // if many points are missing it can still return null
        String line = point.cleanVector(metric) ? point.serialize() : null;
        if (line == null) {
            metric.invalidStocks++;
        } else {
            writtenCap += point.getTotalCap();
            bufWriter.write(line);
            bufWriter.newLine();
            vectorCounter++;
        }
        // written or invalid, a point of the expected length always leaves the map
        points.remove();
    }
    return writtenCap;
}
/**
 * Tells whether two dates fall into the same period, using the default calendar.
 *
 * @param data1 first date
 * @param date2 second date
 * @param check MONTH requires equal year and month, YEAR requires equal year;
 *              any other value never matches
 * @return true when the dates match at the requested granularity
 */
private boolean check(Date data1, Date date2, DateCheckType check) {
    Calendar first = Calendar.getInstance();
    Calendar second = Calendar.getInstance();
    first.setTime(data1);
    second.setTime(date2);
    boolean sameYear = first.get(Calendar.YEAR) == second.get(Calendar.YEAR);
    if (check == DateCheckType.YEAR) {
        return sameYear;
    }
    if (check == DateCheckType.MONTH) {
        return sameYear && first.get(Calendar.MONTH) == second.get(Calendar.MONTH);
    }
    return false;
}
/**
 * Command line entry point.
 *
 * <p>Options: {@code -i} input folder, {@code -o} output folder, {@code -d} number
 * of days, and the optional flag {@code -m} to run under MPI. When a required
 * option is missing or {@code -d} is not an integer, a message is printed to
 * standard error and the JVM exits with status 1.
 *
 * @param args command line arguments
 */
public static void main(String[] args) {
    Options options = new Options();
    options.addOption("i", true, "Input file");
    options.addOption("o", true, "Output file");
    options.addOption("d", true, "Number of days");
    options.addOption("m", false, "MPI");
    CommandLineParser commandLineParser = new BasicParser();
    try {
        CommandLine cmd = commandLineParser.parse(options, args);
        String input = cmd.getOptionValue("i");
        String output = cmd.getOptionValue("o");
        String days = cmd.getOptionValue("d");
        boolean mpi = cmd.hasOption("m");
        // getOptionValue returns null for an absent option; reject that here rather than
        // failing later with a NullPointerException or NumberFormatException
        if (input == null || output == null || days == null) {
            System.err.println("Usage: PVectorGenerator -i <input folder> -o <output folder> -d <days> [-m]");
            System.exit(1);
        }
        int noOfDays = 0;
        try {
            noOfDays = Integer.parseInt(days);
        } catch (NumberFormatException e) {
            System.err.println("Number of days must be an integer: " + days);
            System.exit(1);
        }
        if (mpi) {
            MPI.Init(args);
        }
        try {
            PVectorGenerator vg = new PVectorGenerator(input, output, noOfDays, mpi);
            vg.process();
        } finally {
            // finalize MPI even when processing fails
            if (mpi) {
                MPI.Finalize();
            }
        }
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (MPIException e) {
        e.printStackTrace();
    }
}
}