import mpi.MPI; import mpi.MPIException; import mpi.MpiOps; import org.apache.commons.cli.*; import java.io.*; import java.util.*; /** * Break up the file in to sub files with the dates that we want. * We will use the sub-files to parallely do the vector generation */ public class FileBreaker { private final String inFile; private final String outDir; private int mode; private boolean mpi; private MpiOps mpiOps; private Date startDate; private Date endDate; TreeMap<String, List<Date>> dates = new TreeMap<String, List<Date>>(); private enum DateCheckType { MONTH, YEAR, CONT_YEAR, DAY } public FileBreaker(String inFile, String outFile, String startDate, int mode, String endDate, boolean mpi) { this.mode = mode; this.inFile = inFile; this.outDir = outFile; this.startDate = Utils.parseDateString(startDate); this.endDate = Utils.parseDateString(endDate); this.mpi = mpi; } public void process() { TreeMap<String, List<Date>> dates = Utils.genDates(this.startDate, this.endDate, mode); Map<Integer, String> s = new HashMap<Integer, String>(); if (mpi) { try { mpiOps = new MpiOps(); int rank = mpiOps.getRank(); int size = mpiOps.getSize(); Iterator<String> datesItr = dates.keySet().iterator(); int i = 0; while (datesItr.hasNext()) { String next = datesItr.next(); if (i == rank) { this.dates.put(next, dates.get(next)); } i++; if (i == size) { i = 0; } } } catch (MPIException e) { e.printStackTrace(); } } else { this.dates = dates; } printDates(); processFile(inFile); } private void printDates() { StringBuilder sb = new StringBuilder(); for (Map.Entry<String, List<Date>> e : dates.entrySet()) { sb.append(e.getKey()).append(": "); for (Date d : e.getValue()) { sb.append(Utils.getMonthString(d)).append(" "); } sb.append("\n"); } System.out.println(sb.toString()); } private Set<String> getDatesForThisRecord(Record r) { Set<String> files = new HashSet<String>(); for (Map.Entry<String, List<Date>> entry : dates.entrySet()) { if (mode <= 4) { for (Date d : entry.getValue()) { if (check(d, r.getDate(), DateCheckType.MONTH)) { files.add(entry.getKey()); } } } else if (mode >= 5) { Date start = entry.getValue().get(0); Date end = entry.getValue().get(1); if (check(start, end, r.getDate(), DateCheckType.DAY)) { files.add(entry.getKey()); } } } return files; } /** * Process a stock file and generate vectors for a month or year period */ private void processFile(String inFile) { Map<String, List<Record>> records = new HashMap<String, List<Record>>(); Map<String, BufferedWriter> writers = new HashMap<String, BufferedWriter>(); CleanMetric metric = new CleanMetric(); for (String s : dates.keySet()) { String outFile = outDir + "/" + s + ".csv"; FileOutputStream fos; BufferedWriter bufWriter; try { fos = new FileOutputStream(new File(outFile)); bufWriter = new BufferedWriter(new OutputStreamWriter(fos)); writers.put(s, bufWriter); } catch (FileNotFoundException e) { throw new RuntimeException("Failed to create writer", e); } } int splitCount = 0; BufferedReader bufRead = null; try { int totalCount = 0; FileReader input = new FileReader(inFile); bufRead = new BufferedReader(input); Record record; int count = 0; while ((record = Utils.parseFile(bufRead, metric, true)) != null) { if (record.getFactorToAdjPrice() > 0) { splitCount++; } totalCount++; Set<String> files = getDatesForThisRecord(record); for (String f : files) { List<Record> l = records.get(f); if (l == null) { l = new ArrayList<Record>(); records.put(f, l); } l.add(record); count++; } if (count >= 1000000) { count = 0; System.out.println("Total count: " + totalCount); for (Map.Entry<String, List<Record>> e : records.entrySet()) { BufferedWriter w = writers.get(e.getKey()); for (Record r : e.getValue()) { w.write(r.serialize()); w.newLine(); } e.getValue().clear(); } } } for (Map.Entry<String, List<Record>> e : records.entrySet()) { BufferedWriter w = writers.get(e.getKey()); for (Record r : e.getValue()) { w.write(r.serialize()); w.newLine(); } e.getValue().clear(); } System.out.println("Split count for file: " + inFile + " = " + splitCount); System.out.println("Clean metric for file: " + metric.serialize()); } catch (IOException e) { throw new RuntimeException("Failed to open the file", e); } finally { try { for (BufferedWriter bf : writers.values()) { bf.close(); } if (bufRead != null) { bufRead.close(); } } catch (IOException ignore) { } } } private boolean check(Date data1, Date date2, DateCheckType check) { Calendar cal1 = Calendar.getInstance(); Calendar cal2 = Calendar.getInstance(); cal1.setTime(data1); cal2.setTime(date2); if (check == DateCheckType.MONTH) { if(cal1.get(Calendar.YEAR) == cal2.get(Calendar.YEAR) && cal1.get(Calendar.MONTH) == cal2.get(Calendar.MONTH)) { return true; } } else if (check == DateCheckType.YEAR) { if(cal1.get(Calendar.YEAR) == cal2.get(Calendar.YEAR)) { return true; } } return false; } private boolean check(Date start, Date end, Date compare, DateCheckType check) { if (compare == null) { System.out.println("Comapre null*****************"); } return (compare.equals(start) || compare.after(start)) && compare.before(end); } public static void main(String[] args) { Options options = new Options(); options.addOption("i", true, "Input file"); options.addOption("o", true, "Output directory"); options.addOption("s", true, "Start date"); options.addOption("e", true, "End date"); options.addOption("m", false, "mpi"); options.addOption("d", true, "Mode, 1 - month, 2 year, 3 whole, 4 continous year"); CommandLineParser commandLineParser = new BasicParser(); try { CommandLine cmd = commandLineParser.parse(options, args); String input = cmd.getOptionValue("i"); String output = cmd.getOptionValue("o"); String date = cmd.getOptionValue("s"); String end = cmd.getOptionValue("e"); String days = cmd.getOptionValue("d"); boolean mpi = cmd.hasOption("m"); if (mpi) { MPI.Init(args); } FileBreaker vg = new FileBreaker(input, output, date, Integer.parseInt(days), end, mpi); vg.process(); if (mpi) { MPI.Finalize(); } } catch (ParseException e) { e.printStackTrace(); } catch (MPIException e) { e.printStackTrace(); } } }