import mpi.MPI; import mpi.MPIException; import mpi.MpiOps; import org.apache.commons.cli.*; import java.io.*; import java.util.*; public class PSVectorGenerator { private final String inFolder; private final String outFolder; private Map<Integer, VectorPoint> currentPoints = new HashMap<Integer, VectorPoint>(); private int days; private boolean mpi = false; private MpiOps mpiOps; private Date startDate; private Date endDate; private int mode; private TreeMap<String, List<Date>> dates = new TreeMap<String, List<Date>>(); private Map<String, CleanMetric> metrics = new HashMap<String, CleanMetric>(); public PSVectorGenerator(String inFile, String outFile, int days, boolean mpi, String startDate, String endDate, int mode) { this.days = days; this.inFolder = inFile; this.outFolder = outFile; this.mpi = mpi; this.startDate = Utils.parseDateString(startDate); this.endDate = Utils.parseDateString(endDate); this.mode = mode; } public void process() throws FileNotFoundException { System.out.println("starting vector generator..."); List<Date> dates; if (mode == 6 || mode == 9 || mode == 10) { Set<Date> dateSet = DateUtils.retrieveDates(this.inFolder); dates = DateUtils.sortDates(dateSet); } else { dates = new ArrayList<Date>(); } File inFolder = new File(this.inFolder); TreeMap<String, List<Date>> allDates = DateUtils.genDates(this.startDate, this.endDate, dates, this.mode); for (String dateString : allDates.keySet()) { System.out.println(dateString + " "); } // create the out directory Utils.createDirectory(outFolder); int filesPerProcess = 0; if (mpi) { try { mpiOps = new MpiOps(); int rank = mpiOps.getRank(); int size = mpiOps.getSize(); Iterator<String> datesItr = allDates.keySet().iterator(); int i = 0; while (datesItr.hasNext()) { String next = datesItr.next(); if (i == rank) { this.dates.put(next, allDates.get(next)); } i++; if (i == size) { i = 0; } } } catch (MPIException e) { e.printStackTrace(); } } else { this.dates = allDates; } // now go through the file and figure out the dates that should be considered Map<String, Map<Date, Integer>> datesList = findDates(this.inFolder); for (Map.Entry<String, List<Date>> ed : this.dates.entrySet()) { Date start = ed.getValue().get(0); Date end = ed.getValue().get(1); processFile(inFolder, start, end, ed.getKey(), datesList.get(ed.getKey())); } } private Map<String, Map<Date, Integer>> findDates(String inFile) { FileReader input = null; // a map of datestring -> map <date string, index> Map<String, Map<Date, Integer>> outDates = new HashMap<String, Map<Date, Integer>>(); Map<String, Set<Date>> tempDates = new HashMap<String, Set<Date>>(); // initialize temp dates for (String dateRange : this.dates.keySet()) { tempDates.put(dateRange, new TreeSet<Date>()); } try { input = new FileReader(inFile); BufferedReader bufRead = new BufferedReader(input); Record record; while ((record = Utils.parseFile(bufRead, null, false)) != null) { // check what date this record belongs to for (Map.Entry<String, List<Date>> ed : this.dates.entrySet()) { Date start = ed.getValue().get(0); Date end = ed.getValue().get(1); if (isDateWithing(start, end, record.getDate())) { Set<Date> tempDateList = tempDates.get(ed.getKey()); tempDateList.add(record.getDate()); } } } for (Map.Entry<String, Set<Date>> ed : tempDates.entrySet()) { Set<Date> datesSet = ed.getValue(); int i = 0; Map<Date, Integer> dateIntegerMap = new HashMap<Date, Integer>(); for (Date d : datesSet) { dateIntegerMap.put(d, i); i++; } outDates.put(ed.getKey(), dateIntegerMap); } } catch (FileNotFoundException e) { if (input != null) { try { input.close(); } catch (IOException ignore) { } } } for (Map.Entry<String, Set<Date>> ed : tempDates.entrySet()) { StringBuilder sb = new StringBuilder(); for (Date d : ed.getValue()) { sb.append(Utils.formatter.format(d)).append(" "); } System.out.println(ed.getKey() + ":" + sb.toString()); } return outDates; } private void printExistingVectors() { for (Map.Entry<Integer, VectorPoint> e : currentPoints.entrySet()) { System.out.println(e.getValue().serialize()); } } private void printDates(List dates) { StringBuilder sb = new StringBuilder(""); for (Object s : dates) { sb.append(s.toString()).append(" ,"); } System.out.println(sb.toString()); } /** * Process a stock file and generate vectors for a month or year period */ private void processFile(File inFile, Date startDate, Date endDate, String outFile, Map<Date, Integer> datesList) { BufferedWriter bufWriter = null; BufferedReader bufRead = null; System.out.println("Calc: " + outFile + Utils.formatter.format(startDate) + ":" + Utils.formatter.format(endDate)); int size = -1; vectorCounter = 0; int noOfDays = datesList.size(); String outFileName = outFolder + "/" + outFile + ".csv"; int capCount = 0; CleanMetric metric = this.metrics.get(outFileName); if (metric == null) { metric = new CleanMetric(); this.metrics.put(outFileName, metric); } try { FileReader input = new FileReader(inFile); FileOutputStream fos = new FileOutputStream(new File(outFileName)); bufWriter = new BufferedWriter(new OutputStreamWriter(fos)); bufRead = new BufferedReader(input); Record record; int count = 0; int fullCount = 0; double totalCap = 0; int splitCount = 0; while ((record = Utils.parseFile(bufRead, null, false)) != null) { // not a record we are interested in if (!isDateWithing(startDate, endDate, record.getDate())) { continue; } count++; int key = record.getSymbol(); if (record.getFactorToAdjPrice() > 0) { splitCount++; } // check weather we already have the vector seen VectorPoint point = currentPoints.get(key); if (point == null) { point = new VectorPoint(key, noOfDays, true); currentPoints.put(key, point); } // figure out the index int index = datesList.get(record.getDate()); double price = record.getPrice(); if (!point.add(price, record.getFactorToAdjPrice(), record.getFactorToAdjVolume(), metric, index)) { metric.dupRecords++; System.out.println("dup: " + record.serialize()); } if (price < 0) { price = -1 * price; } point.addCap(record.getVolume() * price); if (point.noOfElements() == size) { fullCount++; } // sort the already seen symbols and determine how many days are there in this period // we take the highest number as the number of days if (currentPoints.size() > 2000 && size == -1) { List<Integer> pointSizes = new ArrayList<Integer>(); for (VectorPoint v : currentPoints.values()) { pointSizes.add(v.noOfElements()); } size = mostCommon(pointSizes); System.out.println("Number of stocks per period: " + size); } // now write the current vectors, also make sure we have the size determined correctly if (currentPoints.size() > 1000 && size != -1 && fullCount > 750) { System.out.println("Processed: " + count); totalCap += writeVectors(bufWriter, noOfDays, metric); capCount++; fullCount = 0; } } System.out.println("Size: " + size); System.out.println("Split count: " + inFile.getName() + " = " + splitCount); // write the rest of the vectors in the map after finish reading the file totalCap += writeVectors(bufWriter, size, metric); capCount++; // write the constant vector at the end VectorPoint v = new VectorPoint(0, noOfDays, true); v.addCap(totalCap); bufWriter.write(v.serialize()); bufWriter.newLine(); v = new VectorPoint(1, noOfDays, true); v.addCap(totalCap); bufWriter.write(v.serialize()); bufWriter.newLine(); v = new VectorPoint(2, noOfDays, true); v.addCap(totalCap); bufWriter.write(v.serialize()); bufWriter.newLine(); v = new VectorPoint(3, noOfDays, true); v.addCap(totalCap); bufWriter.write(v.serialize()); bufWriter.newLine(); v = new VectorPoint(4, noOfDays, true); v.addCap(totalCap); bufWriter.write(v.serialize()); bufWriter.newLine(); System.out.println("Total stocks: " + vectorCounter + " bad stocks: " + currentPoints.size()); metric.stocksWithIncorrectDays = currentPoints.size(); System.out.println("Metrics for file: " + outFileName + " " + metric.serialize()); currentPoints.clear(); } catch (IOException e) { throw new RuntimeException("Failed to open the file", e); } finally { try { if (bufWriter != null) { bufWriter.close(); } if (bufRead != null) { bufRead.close(); } } catch (IOException ignore) { } } } public static <T> T mostCommon(List<T> list) { Map<T, Integer> map = new HashMap<T, Integer>(); for (T t : list) { Integer val = map.get(t); map.put(t, val == null ? 1 : val + 1); } Map.Entry<T, Integer> max = null; for (Map.Entry<T, Integer> e : map.entrySet()) { if (max == null || e.getValue() > max.getValue()) max = e; } return max.getKey(); } int vectorCounter = 0; /** * Write the current vector to file * @param bufWriter stream * @param size * @throws IOException */ private double writeVectors(BufferedWriter bufWriter, int size, CleanMetric metric) throws IOException { double capSum = 0; int count = 0; for(Iterator<Map.Entry<Integer, VectorPoint>> it = currentPoints.entrySet().iterator(); it.hasNext(); ) { Map.Entry<Integer, VectorPoint> entry = it.next(); VectorPoint v = entry.getValue(); if (v.noOfElements() == size) { metric.totalStocks++; if (!v.cleanVector(metric)) { // System.out.println("Vector not valid: " + outFileName + ", " + v.serialize()); metric.invalidStocks++; it.remove(); continue; } String sv = v.serialize(); // if many points are missing, this can return null if (sv != null) { capSum += v.getTotalCap(); count++; bufWriter.write(sv); bufWriter.newLine(); // remove it from map vectorCounter++; metric.writtenStocks++; } else { metric.invalidStocks++; } it.remove(); } else { metric.lenghtWrong++; } } return capSum; } private boolean isDateWithing(Date start, Date end, Date compare) { if (compare == null) { System.out.println("Comapre null*****************"); } return (compare.equals(start) || compare.after(start)) && compare.before(end); } public static void main(String[] args) { Options options = new Options(); options.addOption("i", true, "Input file"); options.addOption("o", true, "Output file"); options.addOption("d", true, "Number of days"); options.addOption("m", false, "MPI"); options.addOption("s", true, "Start date"); options.addOption("e", true, "End date"); options.addOption("md", true, "mode"); CommandLineParser commandLineParser = new BasicParser(); try { CommandLine cmd = commandLineParser.parse(options, args); String input = cmd.getOptionValue("i"); String output = cmd.getOptionValue("o"); String days = cmd.getOptionValue("d"); boolean mpi = cmd.hasOption("m"); String mode = cmd.getOptionValue("md"); String start = cmd.getOptionValue("s"); String end = cmd.getOptionValue("e"); if (mpi) { MPI.Init(args); } PSVectorGenerator vg = new PSVectorGenerator(input, output, Integer.parseInt(days), mpi, start, end, Integer.parseInt(mode)); vg.process(); if (mpi) { MPI.Finalize(); } } catch (ParseException e) { e.printStackTrace(); } catch (MPIException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } } }