import org.apache.commons.cli.*; import org.apache.commons.io.FilenameUtils; import java.io.*; import java.util.*; public class VectorGenerator { private final String inFile; private final String outFile; private Map<Integer, VectorPoint> currentPoints = new HashMap<Integer, VectorPoint>(); private int days; private Date startDate; private Date endDate; private enum DateCheckType { MONTH, YEAR, CONT_YEAR, } public VectorGenerator(String inFile, String outFile, String startDate, int days, String endDate) { this.days = days; this.inFile = inFile; this.outFile = outFile; this.startDate = Utils.parseDateString(startDate); this.endDate = Utils.parseDateString(endDate); } public void process() { Date currentDate = startDate; if (days <= 30) { while (!check(currentDate, endDate, DateCheckType.MONTH)) { System.out.println("Processing: " + Utils.getMonthString(currentDate)); processFile(inFile, currentDate, outFile + "/" + Utils.getMonthString(currentDate) + ".csv"); currentDate = Utils.addMonth(currentDate); currentPoints.clear(); } } else if (days < 400) { while (!check(currentDate, endDate, DateCheckType.YEAR)) { System.out.println("Processing: " + Utils.getYearString(currentDate)); processFile(inFile, currentDate, outFile + "/" + Utils.getYearString(currentDate) + ".csv"); currentDate = Utils.addYear(currentDate); currentPoints.clear(); } } else { System.out.println("Processing whole file"); File in = new File(inFile); String fileName = in.getName(); String fileNameWithOutExt = FilenameUtils.removeExtension(fileName); processFile(inFile, currentDate, outFile + "/" + fileNameWithOutExt + ".csv"); currentPoints.clear(); } } private void printExistingVectors() { for (Map.Entry<Integer, VectorPoint> e : currentPoints.entrySet()) { System.out.println(e.getValue().serialize()); } } private void printDates(List dates) { StringBuilder sb = new StringBuilder(""); for (Object s : dates) { sb.append(s.toString()).append(" ,"); } System.out.println(sb.toString()); } /** * Process a stock file and generate vectors for a month or year period */ private void processFile(String inFile, Date date, String outFile) { BufferedWriter bufWriter = null; BufferedReader bufRead = null; int size = -1; vectorCounter = 0; try { FileReader input = new FileReader(inFile); FileOutputStream fos = new FileOutputStream(new File(outFile)); bufWriter = new BufferedWriter(new OutputStreamWriter(fos)); bufRead = new BufferedReader(input); Record record; int count = 0; int fullCount = 0; while ((record = Utils.parseFile(bufRead)) != null) { count++; // check weather we are interested in this record boolean check; if (days <= 30) { check = check(date, record.getDate(), DateCheckType.MONTH); } else if (days < 400) { check = check(date, record.getDate(), DateCheckType.YEAR); } else { check = true; } // if we are interested in this record if (check) { int key = record.getSymbol(); // check weather we already have the vector seen VectorPoint point = currentPoints.get(key); if (point == null) { point = new VectorPoint(key, days); currentPoints.put(key, point); } point.add(record.getPrice()); point.addCap(record.getVolume() * record.getPrice()); if (point.noOfElements() == size) { fullCount++; } // sort the already seen symbols and determine how many days are there in this period // we take the highest number as the number of days if (currentPoints.size() > 1000 && size == -1) { List<Integer> pointSizes = new ArrayList<Integer>(); for (VectorPoint v : currentPoints.values()) { pointSizes.add(v.noOfElements()); } size = mostCommon(pointSizes); System.out.println("Number of stocks per period: " + size); } // now write the current vectors, also make sure we have the size determined correctly if (currentPoints.size() > 1000 && size != -1 && fullCount > 750) { System.out.println("Processed: " + count); writeVectors(bufWriter, size); fullCount = 0; } } } System.out.println("Size: " + size); // write the rest of the vectors in the map after finish reading the file writeVectors(bufWriter, size); System.out.println("Total stocks: " + vectorCounter + " bad stocks: " + currentPoints.size()); } catch (IOException e) { throw new RuntimeException("Failed to open the file"); } finally { try { if (bufWriter != null) { bufWriter.close(); } if (bufRead != null) { bufRead.close(); } } catch (IOException ignore) { } } } public static <T> T mostCommon(List<T> list) { Map<T, Integer> map = new HashMap<T, Integer>(); for (T t : list) { Integer val = map.get(t); map.put(t, val == null ? 1 : val + 1); } Map.Entry<T, Integer> max = null; for (Map.Entry<T, Integer> e : map.entrySet()) { if (max == null || e.getValue() > max.getValue()) max = e; } return max.getKey(); } int vectorCounter = 0; /** * Write the current vector to file * @param bufWriter stream * @param size * @throws IOException */ private void writeVectors(BufferedWriter bufWriter, int size) throws IOException { double totalCap = 0; for(Iterator<Map.Entry<Integer, VectorPoint>> it = currentPoints.entrySet().iterator(); it.hasNext(); ) { Map.Entry<Integer, VectorPoint> entry = it.next(); VectorPoint v = entry.getValue(); totalCap += v.getTotalCap(); if (v.noOfElements() == size) { String sv = v.serialize(); // if many points are missing, this can return null if (sv != null) { bufWriter.write(sv); bufWriter.newLine(); // remove it from map it.remove(); vectorCounter++; } } } // write the constant vector VectorPoint v = new VectorPoint(0, 0); v.addCap(totalCap / 10); bufWriter.write(v.serialize()); } private boolean check(Date data1, Date date2, DateCheckType check) { Calendar cal1 = Calendar.getInstance(); Calendar cal2 = Calendar.getInstance(); cal1.setTime(data1); cal2.setTime(date2); if (check == DateCheckType.MONTH) { if(cal1.get(Calendar.YEAR) == cal2.get(Calendar.YEAR) && cal1.get(Calendar.MONTH) == cal2.get(Calendar.MONTH)) { return true; } } else if (check == DateCheckType.YEAR) { if(cal1.get(Calendar.YEAR) == cal2.get(Calendar.YEAR)) { return true; } } return false; } public static void main(String[] args) { Options options = new Options(); options.addOption("i", true, "Input file"); options.addOption("o", true, "Output file"); options.addOption("s", true, "Start date"); options.addOption("e", true, "End date"); options.addOption("d", true, "Number of days"); CommandLineParser commandLineParser = new BasicParser(); try { CommandLine cmd = commandLineParser.parse(options, args); String input = cmd.getOptionValue("i"); String output = cmd.getOptionValue("o"); String date = cmd.getOptionValue("s"); String end = cmd.getOptionValue("e"); String days = cmd.getOptionValue("d"); VectorGenerator vg = new VectorGenerator(input, output, date, Integer.parseInt(days), end); vg.process(); } catch (ParseException e) { e.printStackTrace(); } } }