/** * GeDBIT.util.Histogram 2006.05.31 * * Copyright Information: * * Change Log: * 2006.05.31: Copied from original jdb package, by Rui Mao */ package GeDBIT.util; import java.io.BufferedReader; import java.io.FileReader; import java.util.TreeMap; import java.util.Map; import java.util.Iterator; import java.util.ArrayList; import java.math.BigInteger; /** * Given values, 1-d or 2-d, count the histogram * * @author Rui Mao * @version 2006.05.31 */ public class Histogram { static final double Delta = 0.0000000000001; public static class BinInfo { final double center; // center value of the bin double lower, upper; // lower and upper bounds of the bin int size; // number of points in the bin. final double width; // left inclusive, right exclusive public BinInfo(double center, double width) { if (width <= 0) throw new IllegalArgumentException( "Bin width must be positive!"); this.center = center; this.width = width; this.lower = Double.POSITIVE_INFINITY; this.upper = Double.NEGATIVE_INFINITY; this.size = 0; } public BinInfo addPoint(double value) { if ((value - (center - width / 2) < -Delta) || (value - (center + width / 2) >= Delta)) throw new IllegalArgumentException("value not in this bin!"); if (lower > value) lower = value; if (upper < value) upper = value; size++; return this; } public int size() { return size; } public double lower() { return lower; } public double upper() { return upper; } public boolean cover(double value) { if ((value < center - width / 2) || (value >= center + width / 2)) return false; return true; } } /** * Computes a one-dimensional histogram * * @param start * the starting boundary of bins, bins can be left to this * starting point. * @param width * width of bins, must be positive * @param dist * the array of distances to compute histogram * @param first * inclusive * @param last * exclusive * @return a List of BinInfo, non-empty, non-overlaping. sorted asendingly * by the center value. */ public static ArrayList<BinInfo> completeOneDHistogram(double start, double width, double[] dist, int first, int last) { TreeMap<Integer, BinInfo> map = new TreeMap<Integer, BinInfo>(); for (int i = first; i < last; i++) { double temp = (dist[i] - start) / width; int offset = (Math.abs(temp - Math.ceil(temp)) <= Delta) ? (int) Math .ceil(temp) : (int) Math.floor(temp); if (map.containsKey(offset)) { map.put(offset, map.get(offset).addPoint(dist[i])); } else map.put(offset, (new BinInfo(start + (offset + 0.5) * width, width)).addPoint(dist[i])); } ArrayList<BinInfo> result = new ArrayList<BinInfo>(map.size()); result.addAll(map.values()); return result; } /** * Compute one-dimensional histogram * * @param start * the starting boundary of bins * @param width * width of bins * @param dist * the array of distances to compute histogram * @return a 2-d double array of 2 rows. the first row is the upper bound of * each bin, the second row is the corresponding bin size */ public static double[][] oneDHistogram(double start, double width, double[] dist) { TreeMap<Integer, Integer> map = new TreeMap<Integer, Integer>(); for (int i = 0; i < dist.length; i++) { int offset = (int) Math.floor((dist[i] - start) / width); if (map.containsKey(offset)) { map.put(offset, map.get(offset) + 1); } else map.put(offset, 1); } int first = map.firstKey(); int last = map.lastKey(); double[][] histogram = new double[2][last - first + 1]; for (int i = 0; i < histogram[0].length; i++) { histogram[0][i] = start + width * (i + first + 1); histogram[1][i] = 0; } for (Map.Entry<Integer, Integer> entry : map.entrySet()) { histogram[1][entry.getKey() - first] = entry.getValue(); } return histogram; } /** * Given the 2-d (x-y) values, ranges and number of bins, out put the * (non-zero) number of occurances of each 2-d value. A large 2-d matrix * will be used internally, therefore, might not work if numbers of bins are * too large. * * @param xMin * minimum value of x * @param xMax * maximum value of x * @param xSize * number of bins for x, or more accurately, how many parts the x * value range will be divided into * @param xValue * values of x * @param yMin * minimum value of y * @param yMax * maximum value of y * @param ySize * number of bins for y, or more accurately, how many parts the y * value range will be divided into * @param yValue * values of y * @return the occurances //3-d vector, first row are x-values, second * y-values, third row consists of occurances */ public static int[][] TwoDHist(double xMin, double xMax, int xSize, double[] xValue, double yMin, double yMax, int ySize, double[] yValue) { // check argument if (xValue.length != yValue.length) throw new IllegalArgumentException( "x, y values of different length!"); final double xBinWidth = (xMax - xMin) / xSize; final double yBinWidth = (yMax - yMin) / ySize; int[][] counter = new int[xSize + 1][ySize + 1]; // allocate 1 more row // and column to // allow the maximum // value for (int i = 0; i < xSize + 1; i++) for (int j = 0; j < ySize + 1; j++) counter[i][j] = 0; // count the occurances for (int i = 0; i < xValue.length; i++) counter[(int) ((xValue[i] - xMin) / xBinWidth)][(int) ((yValue[i] - yMin) / yBinWidth)]++; // count number of non-zero occurance /* * int length =0; for (int i=0; i< xSize+1; i++) for (int j=0; * j<ySize+1; j++) if (counter[i][j] > 0) length ++; //out put double * [][] result = new double [3][length]; length =0; for (int i=0; i< * xSize; i++) for (int j=0; j<ySize; j++) if (counter[i][j] >0) { * result[0][length] = xMin +i*xBinWidth; result[1][length] = yMin * +j*yBinWidth; result[2][length] = counter[i][j]; length ++; } */ return counter; } /** * Reads a file consisting of coordiantes of points in pivot-space, i.e. * distances to each pivot. the file should be a text file. The first part, * separted by whitespace, of the first line should be a number, the number * of pivots. then each following line is the description of a pivot, in the * order. then an empty line. then, the first part of next line should be a * number, number of points then each following line should be the * coordinates of a point, separted by comma. in the command line, the first * parameter should be the file name, then a series of numbers or chars, x0, * x1, ..., xn, corresponding to dimensions d0, d1, ..., dn, less number of * number/chars can be provided, they will start at dim 0, dims not * specified will be ignored. if xi is '-', then dimension i is ignored, if * xi is '.', then the histogram on dimension i will be computed, if xi is a * number, then the value on dimension i should be fixed as that number, * other values are ignored. min, max values of each dimension to compute * histogram will be reported. if only one dimension to compute histogram, * the histogram will be out put in one line, if two, then a matrix, if * more, then each non-zero tuple will be output. the out put is to the * screen. * * @param args * the first should be the file name, then a series of numbers or * chars, x0, x1, ..., xn, corresponding to dimensions d0, d1, * ..., dn. */ @SuppressWarnings("rawtypes") public static void pivotSpaceHistogram(String[] args) throws Exception { // read file BufferedReader fileReader = new BufferedReader(new FileReader(args[0])); // read the pivots String line = fileReader.readLine().trim(); String[] lineSegment = line.split("[ \t\n\f\r]"); final int pivotNumber = Integer.parseInt(lineSegment[0]); // read all the pivots and the empty line following them String[] pivot = new String[pivotNumber]; for (int i = 0; i < pivotNumber; i++) pivot[i] = fileReader.readLine(); fileReader.readLine(); // empty line // read dataset size line = fileReader.readLine().trim(); lineSegment = line.split("[ \t\n\f\r]"); final int dataSize = Integer.parseInt(lineSegment[0]); // parse the argument, set fixed values, and dims to compute histogram int[] fixedValue = new int[args.length - 1]; boolean[] isFixed = new boolean[pivotNumber]; boolean[] toCompute = new boolean[pivotNumber]; for (int i = 0; i < pivotNumber; i++) { isFixed[i] = false; toCompute[i] = false; } for (int i = 1; i < args.length; i++) { if (args[i].equals(".")) toCompute[i - 1] = true; else if (!args[i].equals("-")) { isFixed[i - 1] = true; fixedValue[i - 1] = Integer.parseInt(args[i]); } } // read each line of coordinates and compute the histogram TreeMap<String, BigInteger> map = new TreeMap<String, BigInteger>(); String key = new String(""); StringBuffer keyBuffer = null; boolean valid = false; // whether current line is consistent with the // fixed values // System.out.println(pivotNumber + " " + dataSize); for (int j = 0; j < dataSize; j++) { line = fileReader.readLine().trim(); lineSegment = line.split("[ \t\n\f\r]*,[ \t\n\f\r]*"); // check consistency valid = true; for (int i = 0; i < pivotNumber; i++) if (isFixed[i] && ((int) Double.parseDouble(lineSegment[i]) != fixedValue[i])) { valid = false; break; } if (!valid) continue; // valid, continue to compute histogram keyBuffer = new StringBuffer(); for (int i = 0; i < pivotNumber; i++) if (toCompute[i]) keyBuffer.append((char) (int) Double .parseDouble(lineSegment[i])); key = new String(keyBuffer); if (map.containsKey(key)) map.put(key, ((BigInteger) map.get(key)).add(BigInteger.ONE)); else map.put(key, new BigInteger("1")); } // finish reading file, output fileReader.close(); System.out.print("#Fixed pivot-distances: [ "); for (int i = 0; i < pivotNumber; i++) if (isFixed[i]) System.out.print("pivot " + pivot[i] + " = " + fixedValue[i] + ", "); System.out.print(" ], "); // find the min, max value int histDim = 0; // number of dimensions to compute histogram for (int i = 0; i < pivotNumber; i++) if (toCompute[i]) histDim++; int[] mapping = new int[histDim]; // mapping[i] the dim number of the // ith dim to compute // histogram int j = 0; for (int i = 0; i < pivotNumber; i++) if (toCompute[i]) { mapping[j] = i; j++; } int[] min = new int[histDim]; int[] max = new int[histDim]; for (int i = 0; i < histDim; i++) { min[i] = Integer.MAX_VALUE; max[i] = 0; } Iterator p = map.entrySet().iterator(); Map.Entry entry = null; double var = 0; // variance double maximum = 0; // maximum value in histogram double sum = 0; // total number of points in histogram while (p.hasNext()) { entry = (Map.Entry) p.next(); key = (String) entry.getKey(); for (int i = 0; i < histDim; i++) { if ((int) key.charAt(i) < min[i]) min[i] = (int) key.charAt(i); if ((int) key.charAt(i) > max[i]) max[i] = (int) key.charAt(i); } double temp = ((BigInteger) entry.getValue()).intValue(); sum += temp; var += temp * temp; if (maximum < temp) maximum = temp; } System.out.println("map size = " + map.size() + ", dataset size = " + sum + ", pivot number = " + pivotNumber); // var /= map.size(); System.out.print("#[min, max] values of each dim of histogram: "); for (int i = 0; i < histDim; i++) System.out.print("pivot " + pivot[mapping[i]] + ": [" + min[i] + ", " + max[i] + " ], "); System.out.println(); // out put the histogram int[][] histogram2D = null; int[] histogram1D = null; if (histDim == 2) { if (((max[0] - min[0] + 1) > 0) && ((max[1] - min[1] + 1) > 0)) { histogram2D = new int[max[0] - min[0] + 1][max[1] - min[1] + 1]; for (int i = 0; i < max[0] - min[0] + 1; i++) for (j = 0; j < max[1] - min[1] + 1; j++) histogram2D[i][j] = 0; System.out .println("#Histogram: " + " std. dev. = " + Math.sqrt(var / (histogram2D.length * histogram2D[0].length) - Math.pow( sum / (histogram2D.length * histogram2D[0].length), 2)) + ", std. dev. of non-zero buckets = " + Math.sqrt(var / map.size() - Math.pow(sum / map.size(), 2)) + ", maximum of histogram:" + maximum); } } else if (histDim == 1) { if ((max[0] - min[0] + 1) > 0) { histogram1D = new int[max[0] - min[0] + 1]; for (int i = 0; i < max[0] - min[0] + 1; i++) histogram1D[i] = 0; System.out.println("#Histogram: " + " std. dev. = " + Math.sqrt(var / histogram1D.length - Math.pow(sum / histogram1D.length, 2)) + ", std. dev. of non-zero buckets = " + Math.sqrt(var / map.size() - Math.pow(sum / map.size(), 2)) + ", maximum of histogram:" + maximum); } } else System.out.println("#Histogram: " + " std. dev. of non-zero buckets = " + Math.sqrt(var / map.size() - Math.pow(sum / map.size(), 2)) + ", maximum of histogram:" + maximum); p = map.entrySet().iterator(); while (p.hasNext()) { entry = (Map.Entry) p.next(); key = (String) entry.getKey(); if (histDim == 1) { histogram1D[(int) key.charAt(0) - min[0]] = ((BigInteger) entry .getValue()).intValue(); } else if (histDim == 2) { histogram2D[(int) key.charAt(0) - min[0]][(int) key.charAt(1) - min[1]] = ((BigInteger) entry.getValue()).intValue(); } else { System.out.print("("); for (int i = 0; i < histDim; i++) System.out.print((int) key.charAt(i) + ", "); System.out.println("): " + entry.getValue()); } } // output the 2-d or 1-d histogram if (histDim == 2) { if (histogram2D != null) { for (int i = 0; i < histogram2D.length; i++) { // System.out.print("#, "); for (j = 0; j < histogram2D[0].length; j++) System.out.print(histogram2D[i][j] + ",\t"); System.out.println(); } /* * System.out.println("# for gnuplot"); if ( (min[0] > 0 ) || * (min[1] >0) ) * System.out.println("# warning! min is not 0!!"); for (int * i=0; i<histogram2D.length; i++) { for ( j=0; j< * histogram2D[0].length; j++) if (histogram2D[i][j] >0) * System.out.println( (i +min[0]) + " " + (j+ min[1]) + " " + * histogram2D[i][j]); } */ } } else if (histDim == 1) { if (histogram1D != null) { // System.out.print("#, "); for (int i = 0; i < histogram1D.length; i++) System.out.print((i + min[0]) + ": " + histogram1D[i] + ", "); System.out.println(); /* * System.out.println("# for gnuplot"); if ( min[0] > 0 ) * System.out.println("# warning! min is not 0!!"); for (int * i=0; i<histogram1D.length; i++) System.out.println( (i + * min[0]) + " " + histogram1D[i] ); */ } } System.out.println(); } /** * Read a file consisting of coordiantes of points in pivot-space, i.e. * distances to each pivot. The file format is the same as those required by * other methods. the file should be a text file. The first part, separted * by whitespace, of the first line should be a number, the number of * pivots. then each following line is the description of a pivot, in the * order. then an empty line. then, the first part of next line should be a * number, number of points then each following line should be the * coordinates of a point, separted by comma. in the command line, the first * parameter should be the file name, the second and third should be * sequential id of dimensions to compute histogram, start from 0. the * output is the 2-d matrix histogram of the two designated dimensions the * out put is to the screen. * * @param args * the first should be the file name, the second and third should * be sequential id of dimensions to compute histogram, start * from 0. */ public static void pivotSpaceTwoDHistogram(String[] args) throws Exception { // read file final int xDim = Integer.parseInt(args[1]); final int yDim = Integer.parseInt(args[2]); @SuppressWarnings("resource") BufferedReader fileReader = new BufferedReader(new FileReader(args[0])); // read the pivots String line = fileReader.readLine().trim(); String[] lineSegment = line.split("[ \t\n\f\r]"); final int pivotNumber = Integer.parseInt(lineSegment[0]); // skip all the pivots and the empty line following them for (int i = 0; i < pivotNumber + 1; i++) fileReader.readLine(); // read dataset size line = fileReader.readLine().trim(); lineSegment = line.split("[ \t\n\f\r]"); final int dataSize = Integer.parseInt(lineSegment[0]); double[] xValue = new double[dataSize]; double[] yValue = new double[dataSize]; for (int j = 0; j < dataSize; j++) { line = fileReader.readLine().trim(); lineSegment = line.split("[ \t\n\f\r]*,[ \t\n\f\r]*"); xValue[j] = Double.parseDouble(lineSegment[xDim]); yValue[j] = Double.parseDouble(lineSegment[yDim]); } int[][] histogram = TwoDHist(0, 40, 40, xValue, 0, 40, 40, yValue); System.out.println("2-d histogram in pivot space, p0 and p2"); for (int i = 0; i < histogram.length; i++) { for (int j = 0; j < histogram[i].length; j++) System.out.print(histogram[i][j] + ", "); System.out.println(); } System.out.println(); } /** * Read a file consisting of coordiantes of points in pivot-space, i.e. * distances to each pivot. This distance values are real number, * double/float the file should be a text file. The first part, separted by * whitespace, of the first line should be a number, the number of pivots. * then each following line is the description of a pivot, in the order. * then an empty line. then, the first part of next line should be a number, * number of points then each following line should be the coordinates of a * point, separted by comma. in the command line, the first parameter should * be the file name, then a series of sets of parameters for each dimension * sequentially, if not enough sets of parameters are provided, later * dimensions are ignored. a set of parameters for a dimension can be of one * of the 3 formats: format 1: two double values separated by white space, * means this dimension is fixed, in the range provided by the two doubles, * first should not be larger than the second, left end inclusive, right end * exclusive. format 2: "-", indicating this dimension is ignored format 3: * "." followed by a number, indicating that this dimension is subject to * histogram computation, and the number is the width of the bin. min, max * values of each dimension to compute histogram will be reported. if only * one dimension to compute histogram, the histogram will be out put in one * line, if two, then a matrix, if more, then each non-zero tuple will be * output. the out put is to the screen. * * @param args * the first should be the file name, then width of bin. then a * series of numbers or chars, x0, x1, ..., xn, corresponding to * dimensions d0, d1, ..., dn. */ public static String continuousPivotSpaceHistogram(String[] args) throws Exception { return continuousPivotSpaceHistogram(args, true); } @SuppressWarnings({ "unchecked", "rawtypes" }) public static String continuousPivotSpaceHistogram(String[] args, boolean print) throws Exception { // read file BufferedReader fileReader = new BufferedReader(new FileReader(args[0])); // read the pivots String line = fileReader.readLine().trim(); String[] lineSegment = line.split("[ \t\n\f\r]"); final int pivotNumber = Integer.parseInt(lineSegment[0]); // read all the pivots and the empty line following them String[] pivot = new String[pivotNumber]; for (int i = 0; i < pivotNumber; i++) pivot[i] = fileReader.readLine(); fileReader.readLine(); // empty line // read dataset size line = fileReader.readLine().trim(); lineSegment = line.split("[ \t\n\f\r]"); final int dataSize = Integer.parseInt(lineSegment[0]); // process parameters double[] lowerBound = new double[pivotNumber]; // for fixed dims, stores // lower(inclusive) / // upper(exclusive) bound // of the bin, double[] upperBound = new double[pivotNumber]; // for dims to computes, // stores min / max // value. double[] width = new double[pivotNumber]; boolean[] isFixed = new boolean[pivotNumber]; boolean[] toCompute = new boolean[pivotNumber]; for (int i = 0; i < pivotNumber; i++) { lowerBound[i] = Double.POSITIVE_INFINITY; upperBound[i] = Double.NEGATIVE_INFINITY; isFixed[i] = false; toCompute[i] = false; } int parameter = 1; int pivotInvolved = 0; while ((parameter < args.length) && (pivotInvolved < pivotNumber)) { if (args[parameter].equalsIgnoreCase(".")) // this is a dimension to // compute histogram { toCompute[pivotInvolved] = true; parameter++; width[pivotInvolved] = Double.parseDouble(args[parameter]); } else if (!args[parameter].equalsIgnoreCase("-")) { isFixed[pivotInvolved] = true; lowerBound[pivotInvolved] = Double.parseDouble(args[parameter]); parameter++; upperBound[pivotInvolved] = Double.parseDouble(args[parameter]); } pivotInvolved++; parameter++; } int fixedDim = 0; // number of fixed dims int histDim = 0; // number of dims to compute histogram for (int i = 0; i < pivotNumber; i++) { if (isFixed[i]) fixedDim++; if (toCompute[i]) histDim++; } int[] fixedMapping = new int[fixedDim]; // fixedMapping[i] is the dim id // of ith dim that is // fixed int[] histMapping = new int[histDim]; // histMapping[i] is the dim id of // ith dim to // compute histogram fixedDim = 0; // serve as loop variable, counter histDim = 0; for (int i = 0; i < pivotNumber; i++) { if (isFixed[i]) { fixedMapping[fixedDim] = i; fixedDim++; } if (toCompute[i]) { histMapping[histDim] = i; histDim++; } } // read (only the necessary part of valid) data into arrays, get // lower/upper bounds of the // dimensions to compute histogram double[][] distance = new double[histDim][dataSize]; int counter = 0; // the counter on valid points boolean valid = false; // whether current point is consistent with the // fixed values double temp = 0; for (int i = 0; i < dataSize; i++) { line = fileReader.readLine().trim(); lineSegment = line.split("[ \t\n\f\r]*,[ \t\n\f\r]*"); // check consistency valid = true; for (int j = 0; j < fixedDim; j++) { temp = Double.parseDouble(lineSegment[fixedMapping[j]]); if ((temp < lowerBound[fixedMapping[j]]) || (temp >= upperBound[fixedMapping[j]])) { valid = false; break; } } if (!valid) continue; for (int j = 0; j < histDim; j++) { distance[j][counter] = Double .parseDouble(lineSegment[histMapping[j]]); if (distance[j][counter] < lowerBound[histMapping[j]]) lowerBound[histMapping[j]] = distance[j][counter]; if (distance[j][counter] > upperBound[histMapping[j]]) upperBound[histMapping[j]] = distance[j][counter]; } counter++; } fileReader.close(); // scan data, compute histogram TreeMap<String, BigInteger> map = new TreeMap<String, BigInteger>(); TreeMap<Integer, BigInteger>[] histMap = new TreeMap[histDim]; // a map // for // each // dimension // to // compute // histogram. for (int i = 0; i < histDim; i++) histMap[i] = new TreeMap<Integer, BigInteger>(); String key = new String(""); StringBuffer keyBuffer = null; int offset = 0; // offset in each dimension Integer offsetKey = null; for (int i = 0; i < counter; i++) { keyBuffer = new StringBuffer(); for (int j = 0; j < histDim; j++) { offset = (int) ((distance[j][i] - lowerBound[histMapping[j]]) / width[histMapping[j]]); keyBuffer.append((char) offset); offsetKey = new Integer(offset); if (histMap[j].containsKey(offsetKey)) histMap[j].put(offsetKey, ((BigInteger) histMap[j] .get(offsetKey)).add(BigInteger.ONE)); else histMap[j].put(offsetKey, new BigInteger("1")); } key = new String(keyBuffer); if (map.containsKey(key)) map.put(key, ((BigInteger) map.get(key)).add(BigInteger.ONE)); else map.put(key, new BigInteger("1")); } // compute variance of bins Iterator p = map.entrySet().iterator(); Map.Entry entry = null; double var = 0; // variance int max = 0; // max bin size double sum = 0; // total number of points in histogram double entropy = 0; while (p.hasNext()) { entry = (Map.Entry) p.next(); key = (String) entry.getKey(); temp = ((BigInteger) entry.getValue()).intValue(); if (max < temp) max = (int) temp; sum += temp; var += temp * temp; entropy += temp * Math.log(temp); } // compute variances of distances, and entropy, of each dimension double[] distVar = new double[histDim]; double[] distSum = new double[histDim]; double[] histEntropy = new double[histDim]; for (int i = 0; i < histDim; i++) { distVar[i] = 0; distSum[i] = 0; histEntropy[i] = 0; } for (int i = 0; i < counter; i++) for (int j = 0; j < histDim; j++) { distSum[j] += distance[j][i]; distVar[j] += distance[j][i] * distance[j][i]; } // compute entropy for each dim for (int i = 0; i < histDim; i++) { distSum[i] /= counter; distVar[i] = Math.sqrt(distVar[i] / counter - Math.pow(distSum[i], 2)); p = histMap[i].entrySet().iterator(); while (p.hasNext()) { entry = (Map.Entry) p.next(); temp = ((BigInteger) entry.getValue()).intValue(); histEntropy[i] += temp * Math.log(temp); } histEntropy[i] = Math.log(counter) - histEntropy[i] / counter; } // output: statistics StringBuffer result = new StringBuffer(); StringBuffer dimResult = new StringBuffer(); // results for each dim if (print) { System.out.println("Fixed pivot-distances: "); for (int i = 0; i < fixedDim; i++) System.out.println("pivot " + pivot[fixedMapping[i]] + " = (" + lowerBound[fixedMapping[i]] + " ~ " + upperBound[fixedMapping[i]] + ") "); System.out .println("[min, max, average, std.dev, entropy]: width values of each dimension to compute histogram: "); } for (int i = 0; i < histDim; i++) { if (print) System.out.println("pivot " + pivot[histMapping[i]] + ": [" + lowerBound[histMapping[i]] + ", " + distSum[i] + ", " + upperBound[histMapping[i]] + ", " + distVar[i] + ", " + histEntropy[i] + " ]: " + width[histMapping[i]]); dimResult.append(pivot[histMapping[i]].split(":")[1].trim() + " [ " + (upperBound[histMapping[i]] - lowerBound[histMapping[i]]) + " : " + lowerBound[histMapping[i]] + " " + distSum[i] + " " + upperBound[histMapping[i]] + " " + distVar[i] + " " + histEntropy[i] + " ] "); } // sum of all variance distVar[0] *= distVar[0]; for (int i = 1; i < histDim; i++) { distVar[0] += distVar[i] * distVar[i]; histEntropy[0] += histEntropy[i]; } if (print) { System.out.println("Sum of variance, entropy, on all dimensions =" + distVar[0] + ", e: " + histEntropy[0]); System.out.println("map size = " + map.size() + ", dataset size = " + (int) sum + ", pivot number = " + pivotNumber + ", max bin size = " + max); } result.append(map.size() + " " + max); int totalBin = 1; for (int i = 0; i < histDim; i++) totalBin *= (int) ((upperBound[histMapping[i]] - lowerBound[histMapping[i]]) / width[histMapping[i]]) + 1; if (print) System.out.println("Std. dev. among non-empty bins = " + Math.sqrt(var / map.size() - Math.pow(sum / map.size(), 2)) + ", among all bins = " + Math.sqrt(var / totalBin - Math.pow(sum / totalBin, 2)) + ", 2-raw moment = " + (var / counter) / counter + ", entropy = " + (Math.log(counter) - entropy / counter)); result.append(" " + (Math.log(counter) - entropy / counter) + " " + histEntropy[0] + " " + (var / counter) / counter + " " + Math.sqrt(var / map.size() - Math.pow(sum / map.size(), 2)) + " " + Math.sqrt(var / totalBin - Math.pow(sum / totalBin, 2)) + " "); result.append(dimResult); if (!print) return result.toString(); // out put: the histogram int[][] histogram2D = null; int[] histogram1D = null; if (histDim == 2) { if ((upperBound[histMapping[0]] >= lowerBound[histMapping[0]]) && (upperBound[histMapping[1]] >= lowerBound[histMapping[1]])) { histogram2D = new int[(int) ((upperBound[histMapping[0]] - lowerBound[histMapping[0]]) / width[histMapping[0]]) + 1][(int) ((upperBound[histMapping[1]] - lowerBound[histMapping[1]]) / width[histMapping[1]]) + 1]; for (int i = 0; i < histogram2D.length; i++) for (int j = 0; j < histogram2D[0].length; j++) histogram2D[i][j] = 0; } } else if (histDim == 1) { if (upperBound[histMapping[0]] >= lowerBound[histMapping[0]]) { histogram1D = new int[(int) ((upperBound[histMapping[0]] - lowerBound[histMapping[0]]) / width[histMapping[0]]) + 1]; for (int i = 0; i < histogram1D.length; i++) histogram1D[i] = 0; } } p = map.entrySet().iterator(); while (p.hasNext()) { entry = (Map.Entry) p.next(); key = (String) entry.getKey(); if (histDim == 1) { histogram1D[(int) key.charAt(0)] = ((BigInteger) entry .getValue()).intValue(); } else if (histDim == 2) { histogram2D[(int) key.charAt(0)][(int) key.charAt(1)] = ((BigInteger) entry .getValue()).intValue(); } else { /* * System.out.print( "("); for (int i=0; i< histDim; i++) * System.out.print( (int) key.charAt(i) + ", "); * System.out.println( "): " + entry.getValue()); */ } } // output the 2-d or 1-d histogram if (histDim == 2) { if (histogram2D != null) { for (int i = 0; i < histogram2D.length; i++) { for (int j = 0; j < histogram2D[0].length; j++) System.out.print(histogram2D[i][j] + ",\t"); System.out.println(); } } } else if (histDim == 1) { if (histogram1D != null) { for (int i = 0; i < histogram1D.length; i++) System.out.print(i + ": " + histogram1D[i] + ", "); System.out.println(); } } System.out.println(); // System.out.println(result); return result.toString(); } static void aminoacid(String[] args) throws Exception { String all = "acdefghiklmnpqrstvwy"; String fileName = args[0]; StringBuffer result = new StringBuffer(); StringBuffer parameter = new StringBuffer(); final int total = 20; for (int i = 0; i < total; i++) { for (int j = i + 1; j < total; j++) { parameter = new StringBuffer(fileName); for (int k = 0; k < i; k++) parameter.append(" -"); parameter.append(" . 1"); for (int k = i + 1; k < j; k++) parameter.append(" -"); parameter.append(" . 1"); result.append(all.charAt(i)) .append(all.charAt(j)) .append(" : " + continuousPivotSpaceHistogram(parameter .toString().split(" ")) + "\n"); System.out.println(result.toString()); // System.out.println( all.charAt(i) + all.charAt(j) + " : " + // parameter.toString() // ); } } } public static double allPairs(String[] args) throws Exception { return allPairs(args, true); } public static double allPairs(String[] args, boolean print) throws Exception { String fileName = args[0]; final int dim = Integer.parseInt(args[1]); final double binWidth = Double.parseDouble(args[2]); StringBuffer result = new StringBuffer(); StringBuffer parameter = new StringBuffer(); double max = 0, temp; String tempString; for (int i = 0; i < dim - 1; i++) { for (int j = i + 1; j < dim; j++) { parameter = new StringBuffer(fileName); for (int k = 0; k < i; k++) parameter.append(" -"); parameter.append(" . ").append(binWidth); for (int k = i + 1; k < j; k++) parameter.append(" -"); parameter.append(" . ").append(binWidth); tempString = continuousPivotSpaceHistogram(parameter.toString() .split(" "), print); temp = Double.parseDouble(tempString.split(" ")[2]); max = (temp > max) ? temp : max; result.append(i).append(" " + j) .append(" : " + tempString + "\n"); if (print) System.out.println(result.toString()); // System.out.println( all.charAt(i) + all.charAt(j) + " : " + // parameter.toString() // ); } } if (!print) System.out.println(result.toString()); return max; } static void aminoacid3(String[] args) throws Exception { String all = "acdefghiklmnpqrstvwy"; String fileName = args[0]; StringBuffer result = new StringBuffer(); StringBuffer parameter = new StringBuffer(); final int total = 20; for (int i = 0; i < total; i++) { for (int j = i + 1; j < total; j++) { for (int k = j + 1; k < total; k++) { parameter = new StringBuffer(fileName); for (int t = 0; t < i; t++) parameter.append(" -"); parameter.append(" . 1"); for (int t = i + 1; t < j; t++) parameter.append(" -"); parameter.append(" . 1"); for (int t = j + 1; t < k; t++) parameter.append(" -"); parameter.append(" . 1"); result.append(all.charAt(i)) .append(all.charAt(j)) .append(all.charAt(k)) .append(" : " + continuousPivotSpaceHistogram(parameter .toString().split(" ")) + "\n"); System.out.println(result.toString()); // System.out.println( all.charAt(i) + all.charAt(j) + " : " // + // parameter.toString() ); } } } } public static void main(String[] args) throws Exception { double[][] hist = oneDHistogram(3.1, 1, new double[] { 1, 2, 3, 4, 5, 12, 30 }); for (int i = 0; i < hist[0].length; i++) System.out.print(hist[0][i] + ", "); System.out.println(); for (int i = 0; i < hist[1].length; i++) System.out.print((int) hist[1][i] + ", "); System.out.println(); // allPairs(args); // System.out.println( continuousPivotSpaceHistogram(args) ); // aminoacid(args); // aminoacid3(args); // pivotSpaceTwoDHistogram(args); /* * char [] s = new char[5]; s[0] = (char) 0; s[1] = (char) 1; s[2] = * (char) 2; s[3] = (char) 95; s[4] = (char) 102; String ss = new * String(s); for (int i=0; i< ss.length(); i++) System.out.println( * (int) ss.charAt(i) ); System.out.println(ss + ss.length()); */ } }