import org.apache.commons.cli.*; import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation; import java.io.*; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.*; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; public class Histogram { private String vectorFolder; private String distFolder; private String stockFile; private static int INC = 7000; private int bins = 10; private boolean global = false; // we will use this many files to generate the global bins static public int GLOBAL_FILE_COUNT = 10; private NumberFormat formatter = new DecimalFormat("#0.00"); public Histogram(String vectorFolder, String distFolder, int bins, String stockFile, boolean global) { this.vectorFolder = vectorFolder; this.distFolder = distFolder; this.bins = bins; this.stockFile = stockFile; this.global = global; } public static void main(String[] args) { Options options = new Options(); options.addOption("v", true, "Input Vector folder"); options.addOption("d", true, "Destination folder"); // Destination folder options.addOption("b", true, "Number of bins"); options.addOption("s", true, "Stock file"); // Original global file options.addOption(Utils.createOption("g", false, "Use global bins", false)); CommandLineParser commandLineParser = new BasicParser(); try { CommandLine cmd = commandLineParser.parse(options, args); String _vectorFile = cmd.getOptionValue("v"); String _distFile = cmd.getOptionValue("d"); int bins = Integer.parseInt(cmd.getOptionValue("b")); String stockFile = cmd.getOptionValue("s"); boolean globalBins = cmd.hasOption("g"); Histogram histogram = new Histogram(_vectorFile, _distFile, bins, stockFile, globalBins); histogram.process(); } catch (ParseException e) { System.out.println(options.toString()); } } private void process() { System.out.println("Starting Histogram calculator..."); File inFolder = new File(vectorFolder); if (!inFolder.isDirectory()) { System.out.println("In should be a folder"); return; } Map<Integer, String> permNoToSymbol = Utils.loadMapping(stockFile); // create the out directory Utils.createDirectory(distFolder); BlockingQueue<File> files = new LinkedBlockingQueue<File>(); List<File> list = new ArrayList<File>(); Collections.addAll(list, inFolder.listFiles()); Collections.sort(list); files.addAll(list); if (!this.global) { for (File f : files) { String outFileName = distFolder + "/" + f.getName(); System.out.println("generate histogram: " + outFileName); Bin[] bins = genHistoGram(f, this.bins, permNoToSymbol); writeBins(outFileName, bins); } } else { Bin []globalBins = genGlobalBins(list); for (File f : files) { String outFileName = distFolder + "/" + f.getName(); System.out.println("generate histogram: " + outFileName); Bin[] bins = genHistoUsingGlobalBins(f, this.bins, permNoToSymbol, globalBins); writeBins(outFileName, bins); } } System.out.println("Histogram calculator finished..."); } public void writeBins(String outFile, Bin[] bins) { BufferedWriter bufWriter = null; try { FileOutputStream fos = new FileOutputStream(new File(outFile)); bufWriter = new BufferedWriter(new OutputStreamWriter(fos)); for (int i = 0; i < bins.length; i++) { String s = i + "," + bins[i].serializeSymbols(); bufWriter.write(s); bufWriter.newLine(); } } catch (IOException e) { throw new RuntimeException("Faile to write bins", e); } finally { if (bufWriter != null) { try { bufWriter.close(); } catch (IOException e) { e.printStackTrace(); } } } } public Bin[] genHistoUsingGlobalBins(File inFile, int noOfBins, Map<Integer, String> permNoToSymbol, Bin []globalBins) { Map<Integer, Double> vecs = proceeVectorFile(inFile); List<Double> values = new ArrayList<Double>(vecs.values()); Collections.sort(values); Bin []bins = new Bin[noOfBins]; for (int i = 0; i < noOfBins; i++) { Bin bin = new Bin(); bin.start = globalBins[i].start; bin.end = globalBins[i].end; bins[i] = bin; } for (Map.Entry<Integer, Double> e : vecs.entrySet()) { int perm = e.getKey(); double val = e.getValue(); Bin b = getBinIndex(val, bins); b.permNos.add(perm); b.symbols.add(permNoToSymbol.get(perm)); } return bins; } public Bin[] genGlobalBins(List<File> allFiles) { System.out.println("Generating global bins: " + this.bins); // we will use a pre defined number of files to generate the global bins int size = allFiles.size(); int ratio = size / GLOBAL_FILE_COUNT; double max = Double.MIN_VALUE; double min = Double.MAX_VALUE; for (int i = 0; i < size - ratio; i++) { File f = allFiles.get(i + ratio); Map<Integer, Double> vecs = proceeVectorFile(f); StandardDeviation percentile = new StandardDeviation(); List<Double> values = new ArrayList<Double>(vecs.values()); double array[] = new double[values.size()]; double sum = 0; for (int k = 0; k < array.length; k++) { array[k] = values.get(k); sum += array[k]; if (Double.isNaN(array[k])) { System.out.println("NAN"); } } double mean = sum / array.length; double p = percentile.evaluate(array); Collections.sort(values); if (values.size() <= 1) continue; double fileMax = mean + p; double fileMin = mean - p; if (fileMax > max) { max = fileMax; } if (fileMin < min) { min = fileMin; } } double delta = (max - min) / this.bins; Bin []bins = new Bin[this.bins]; for (int i = 0; i < bins.length; i++) { Bin b = new Bin(); b.start = min + i * delta; b.end = min + (i + 1)* (delta); bins[i] = b; } System.out.println("Global bins MAX: " + max + " MIN: " + min); return bins; } public Bin[] genHistoGram(File inFile, int noOfBins, Map<Integer, String> permNoToSymbol) { Map<Integer, Double> vecs = proceeVectorFile(inFile); List<Double> values = new ArrayList<Double>(vecs.values()); Collections.sort(values); int binSize = values.size() / bins; Bin []bins = new Bin[noOfBins]; for (int i = 0; i < noOfBins; i++) { Bin bin = new Bin(); bin.start = values.get(i * binSize); int index = (i + 1) * binSize; if (index >= values.size()) index = values.size() - 1; bin.end = values.get(index); bins[i] = bin; } for (Map.Entry<Integer, Double> e : vecs.entrySet()) { int perm = e.getKey(); double val = e.getValue(); Bin b = getBinIndex(val, bins); b.permNos.add(perm); b.symbols.add(permNoToSymbol.get(perm)); } return bins; } private Bin getBinIndex(double val, Bin []bins) { for (int i = 0; i < bins.length; i++) { Bin b = bins[i]; // add all that is below the 0'th bin to 0 if (val < b.start) { return b; } if (b.start <= val && b.end >= val) { return b; } } // add all that is over the last bin value to last return bins[bins.length - 1]; } public Map<Integer, Double> proceeVectorFile(File inFile) { System.out.println("Generating histogram for stocks file: " + inFile); Map<Integer, Double> deltas = new HashMap<Integer, Double>(); if (!inFile.exists()) { System.out.println("ERROR: In file doens't exist"); return null; } int startIndex = 0; int endIndex = -1; List<VectorPoint> vectors; do { startIndex = endIndex + 1; endIndex = startIndex + INC - 1; vectors = Utils.readVectors(inFile, startIndex, endIndex); if (vectors.size() == 0) { break; } // write the vectors to file for (VectorPoint v : vectors) { double delta = vectorDelta(v.getNumbers()); deltas.put(v.getKey(), delta); } } while (true); return deltas; } private double vectorDelta(double []n) { double sum = 0.0; for (double aN : n) { sum += aN; } if (sum == 0) return .1; // double delta = max - min; double delta = Utils.lastNonZero(n) - Utils.firstNonZero(n); return delta * n.length / sum; } }