/*
This file is part of Cyclos (www.cyclos.org).
A project of the Social Trade Organisation (www.socialtrade.org).
Cyclos is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
Cyclos is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Cyclos; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package nl.strohalm.cyclos.services.stats;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.ListIterator;
import nl.strohalm.cyclos.entities.reports.StatisticalNumber;
import nl.strohalm.cyclos.entities.settings.LocalSettings;
import nl.strohalm.cyclos.utils.StringValuedEnum;
import nl.strohalm.cyclos.utils.statistics.ListOperations;
/**
* This class takes a simple list with data, retrieved straight from a database query, and uses it to create a StatisticalResultDTO value object for
* the creation of a Histogram. The list of data is divided into histogram categories of reasonable size for the statistics graphs.
* <p>
* <b>Beware</b>: the class cannot handle negative x-axis values. It starts counting at 0, so if the x-axis factor has negative values, then the first
* category on the x-axis will be VERY big, because it also contains all x-values &lt; 0. In this case, no exception is raised, but the figure looks
* rather silly, with one big first category, and the rest of the x-axis categories very tiny.
* <p>
* <b>Beware 2:</b><br>
* Note that lists of Integers and lists of Doubles are treated differently, especially where it concerns the division over x-axis categories. You
* should NOT enter a List of numbers which represent integer values as a List of Doubles. If you do so, the x-axis categories will use irrelevant
* fractional numbers and your histogram may contain a lot of silly gaps.
*
* @author Rinke
*
*/
public class HistogramDTOFactory {
/**
 * An enum determining the prefix which is put in front of the label of the last bar in the graph.
 *
 * @author rinke
 */
public enum LastRowHeaderPrefix implements StringValuedEnum {

    /**
     * No prefix: the last bar is labeled like any other bar.
     */
    NONE(""),

    /**
     * A "greater than" sign, used for the rest category when the input is not treated as a list of Integers.
     */
    GREATER(">"),

    /**
     * A "greater than or equal to" sign, used for the rest category when the input is treated as a list of Integers.
     */
    GREATER_EQUAL("\u2265");

    /**
     * The text of the prefix. It is prepended to the row header of the last bar.
     */
    private final String value;

    /**
     * @param prefixText the text to be returned by {@link #getValue()}
     */
    LastRowHeaderPrefix(final String prefixText) {
        value = prefixText;
    }

    /**
     * @return the prefix text of this constant
     */
    public String getValue() {
        return this.value;
    }
}
/**
* The maximum number of regular bars in the histogram. One extra "rest" bar may be appended after these, holding all remaining observations.
*/
private static final int MAXBARS = 39;
/**
* In the higher tail, no more categories/bars are shown if a number of consecutive bars each hold fewer observations than
* (LOW_BAR_LEVEL * highestBarValue). Example: if LOW_BAR_LEVEL = 0.01 and the highest bar value = 200, then a bar counts as "low" when it holds
* fewer than 2 observations (1% of 200). The required number of consecutive low bars is not hard coded here, but is the constant
* ALLOWABLE_TAIL_GAP. Note that the highest bar value used for this comparison starts at 100, so the threshold is never lower than
* 100 * LOW_BAR_LEVEL.
*/
private static final double LOW_BAR_LEVEL = 0.008;
/**
* In the higher tail, if at least ALLOWABLE_TAIL_GAP consecutive bars have a too low value (see LOW_BAR_LEVEL), then iteration may be stopped and
* all rest values go in the final extra bar.
*/
private static final int ALLOWABLE_TAIL_GAP = 3;
/**
* In the higher tail, the rest category should not be shown if it contains this fraction of all observations or more. So, if
* MAX_REST_CAT_FRACTION = 0.10, this means that the rest category must contain less than 10% of all observations. If it would contain more, the
* iteration over the bars just continues.
*/
private static final double MAX_REST_CAT_FRACTION = 0.09;
/**
* This factor is used for calculating the optimum category width. If 70, it takes the 70 percentile as calibration point to base this calculation
* upon. Slight adjustments can give better results. Optimum value seems to be between 70 and 95. Higher value results in broader classes, but
* then (of course) a smaller number of bars. As it is a percentile (so derived from percentage) it should by definition be between 0 and 100; if
* not, the calculated index falls outside the sorted input list and an index-out-of-bounds exception will result.
*/
private static final int CALIBRATION_PERCENTILE = 89;
/**
* The prefix for the label of the last bar. It is set whenever the result lists are produced, and read when the row headers are assigned.
*/
private LastRowHeaderPrefix lastRowHeaderPrefix;
/**
* The resulting data value object with all the data for creating the graph. Typically, it contains the graph bar values. It stays null until the
* result has been produced for the first time.
*/
private StatisticalResultDTO result;
/**
* The input list with all the data retrieved from the database query. This is for example a list with all the personal gross products of all
* members.
*/
private final List<Number> input;
/**
* The output list with the x-axis values of the bars, already divided by scaleFactorX. For the regular bars this is the category center; for the
* rest bar (if any) it is a category boundary.
*/
private List<Number> xResult;
/**
* The output list with all results. Each number gives the amount of observations (for example members) found in the category. The corresponding
* category value is at the same index in the xResult list.
*/
private List<Number> yResult;
/**
* A factor indicating how the x-axis is scaled. For example: if the x-axis has values around 5000, the axis labels will show numbers like 1.0,
* 2.0, ...5.0, 6.0, and the scale factor will be 1000. This factor should be used as an extension to the x-axis label (showing "x 1000").
*/
private double scaleFactorX;
/**
* The local settings, needed in order to format the numbers of the x-axis categories according to the local settings. May be null, in which case
* the numbers are not formatted.
*/
private final LocalSettings settings;
/**
 * Creates a factory for the given data. Nothing is calculated yet; the constructor only stores its arguments and resets the x-axis scale factor
 * to 1.
 *
 * @param input the {@code List<Number>} with the data to put in the histogram. Lists of Integers and lists of Doubles are treated differently,
 * especially where it concerns the division over x-axis categories. Do NOT pass numbers which represent integer values as a list of Doubles: the
 * x-axis categories would then use irrelevant fractional numbers and the histogram may contain a lot of silly gaps.
 * @param settings a LocalSettings, used to format the x-axis numbers. A null value is allowed; the numbers are then left unformatted.
 */
public HistogramDTOFactory(final List<Number> input, final LocalSettings settings) {
    this.settings = settings;
    this.input = input;
    this.scaleFactorX = 1.0;
}
/**
 * Assigns the x-axis labels (row headers) to the result object, using the xResult field as input.
 * <p>
 * Each x value becomes one label. Without settings the plain <code>toString()</code> of the number is used; with settings the number is
 * formatted by the number converter the settings supply for precision 1. The label of the last bar is prefixed with the value of
 * <code>lastRowHeaderPrefix</code>.
 * <p>
 * Does nothing if there is no result object yet. An empty xResult list yields an empty array of row headers.
 */
private void assignRowHeaders() {
    if (result == null) {
        return;
    }
    final int count = xResult.size();
    final String[] rowHeaders = new String[count];
    for (int i = 0; i < count; i++) {
        final Number x = xResult.get(i);
        if (settings == null) {
            rowHeaders[i] = x.toString();
        } else {
            // BigDecimal.valueOf(double) uses the canonical decimal representation of the double. Narrowing to float and using the
            // BigDecimal(double) constructor would carry the binary representation error of the float into the formatted label.
            rowHeaders[i] = settings.getNumberConverterForPrecision(1).toString(BigDecimal.valueOf(x.doubleValue()));
        }
    }
    // only the last bar gets the prefix; skip this if there are no bars at all
    if (count > 0) {
        rowHeaders[count - 1] = lastRowHeaderPrefix.getValue() + rowHeaders[count - 1];
    }
    result.setRowHeaders(rowHeaders);
}
/**
 * Calculates the class (bar) width as a nice round value, and sets the scaleFactorX field as a side effect.
 * <p>
 * With e = floor(log10(calibrationValue)), the scale factor becomes 10^e and the width becomes ceil(calibrationValue / 10^e) * 10^(e-1): about
 * one tenth of the calibration value, rounded up to one significant digit. Example: a calibration value of 4700 gives a scale factor of 1000 and
 * a width of 500.
 * <p>
 * A calibration value which has no usable logarithm (zero, negative, NaN or infinite) results in a scale factor of 1 and a width of 1, instead
 * of in NaN values.
 *
 * @param calibrationValue the value found in the sorted input at the CALIBRATION_PERCENTILE position, so the value for which about
 * CALIBRATION_PERCENTILE percent of the list is smaller.
 * @param integers true if the list contains integers; the width is then rounded to a whole number, with a minimum of 1.
 * @return the class width for each bar in the graph.
 */
private double calcClassWidth(final double calibrationValue, final boolean integers) {
    // written as a negated comparison so that NaN is caught as well
    if (!(calibrationValue > 0.0) || Double.isInfinite(calibrationValue)) {
        scaleFactorX = 1.0;
        return 1.0;
    }
    final double exponent = Math.floor(logBase(10, calibrationValue));
    scaleFactorX = Math.pow(10, exponent);
    // the leading digit(s) of the calibration value, rounded up
    final double superRoundedValue = Math.ceil(calibrationValue / scaleFactorX);
    double width = superRoundedValue * Math.pow(10, exponent - 1);
    if (integers) {
        // integer data needs a whole-number class width of at least 1
        width = (width >= 0.5) ? Math.round(width) : 1.0;
    }
    return width;
}
/**
 * Scales an x value by dividing it by scaleFactorX. This way an x-axis label is shown as for example 2.0 and not 200, while the x-axis itself
 * is labeled with "(x 100)".
 *
 * @param number the unscaled x value
 * @return the x value divided by the current scale factor
 */
private double formatX(final double number) {
    final double scaled = number / scaleFactorX;
    return scaled;
}
/**
 * Formats the scaleFactorX field into a String which can be used as a suffix to the x-axis label. If the result lists have not been produced
 * yet, they are produced first.
 * <p>
 * The factor is rounded to the nearest whole number. A plain cast would truncate: 1 / 1.0E-5 for example evaluates to 99999.99999999999, which
 * would be shown as 99999 instead of 100000.
 *
 * @param baseKey a String which is the base key for the language resource bundle. All Strings to be built for the rendered graph will be built
 * with this key as the basis.
 * @return an empty String if the scale factor is 1; otherwise a String like " ( x 1000)" if the x-axis categories are in thousands, or like
 * " ( / 1000)" if they are in thousandths.
 */
private String getScaleFactorString(final String baseKey) {
    if (xResult == null) {
        produceResultArrays(baseKey);
    }
    if (scaleFactorX == 1) {
        return "";
    }
    if (scaleFactorX < 1) {
        return " ( / " + Math.round(1 / scaleFactorX) + ")";
    }
    return " ( x " + Math.round(scaleFactorX) + ")";
}
/**
 * Simple helper function to calculate (base)log(value). Needed in order to calculate the optimum bar class width.
 * <p>
 * For base 10, <code>Math.log10</code> is used, because the quotient of two natural logarithms is slightly off for exact powers of ten:
 * Math.log(1000) / Math.log(10) evaluates to 2.9999999999999996, which a caller taking the floor would turn into 2 instead of 3.
 *
 * @param base the base of the logarithm
 * @param value the number you want to take the logarithm from
 * @return (base)log(value)
 */
private double logBase(final double base, final double value) {
    if (base == 10.0) {
        return Math.log10(value);
    }
    return Math.log(value) / Math.log(base);
}
/**
 * Turns the results of the main calculations into a usable StatisticalResultDTO, which is stored in the result field. Every entry of yResult
 * becomes one row with a single column in the data table of the DTO. The DTO is set up as a bar graph without a table, and gets its row headers
 * and its x-axis scale factor suffix assigned.
 *
 * @param baseKey a String which is the base key for the language resource bundle. All Strings to be built for the rendered graph will be built
 * with this key as the basis.
 */
private void produceDTO(final String baseKey) {
    final int barCount = yResult.size();
    final Number[][] cells = new Number[barCount][1];
    int row = 0;
    for (final Number barValue : yResult) {
        cells[row][0] = new StatisticalNumber(barValue.doubleValue(), (byte) 0);
        row++;
    }
    result = new StatisticalResultDTO(cells);
    result.setGraphType(StatisticalResultDTO.GraphType.BAR);
    assignRowHeaders();
    final String scaleSuffix = getScaleFactorString(baseKey);
    result.setScaleFactorX(scaleSuffix);
    result.setShowTable(false);
}
/**
 * Creates a <code>StatisticalResultDTO</code> with just one bar: the x value is the (scaled) first input value, the y value is the number of
 * input elements, and the bar label gets no prefix. Called if all values in the list are equal.
 *
 * @param baseKey the baseKey as used by the language resource bundle in order to create language labels.
 */
private void produceJustOneBarDTO(final String baseKey) {
    final double onlyValue = input.get(0).doubleValue();
    final int observations = input.size();
    xResult.add(formatX(onlyValue));
    yResult.add(observations);
    lastRowHeaderPrefix = LastRowHeaderPrefix.NONE;
    produceDTO(baseKey);
}
/**
* Method doing all the calculations. It calculates the class width, spreads all the input values over the bars of the graph (filling the xResult
* and yResult lists), and finally creates the <code>StatisticalResultDTO</code> in the result field. The method is highly commented because the
* algorithm and calculation is rather complicated.
* <p>
* Outline:
* <ol>
* <li>Too few input values: a "no data available" result is produced and nothing else happens.</li>
* <li>All input values equal: a result with one single bar is produced.</li>
* <li>Otherwise the class width is derived from the value at the CALIBRATION_PERCENTILE position of the sorted input, and the sorted input is
* walked once, bar by bar, for at most MAXBARS bars.</li>
* <li>Whatever has not been assigned to a bar when the walk stops is put in one final "rest" bar, whose label gets a prefix.</li>
* </ol>
*
* @param baseKey the baseKey as used by the language resource bundle in order to create language labels.
*/
private void produceResultArrays(final String baseKey) {
// if the set of data is too small, don't calculate anything
if (input.size() < StatisticalService.MINIMUM_NUMBER_OF_VALUES) {
produceTooSmallDatasetDTO(baseKey);
return;
}
yResult = new ArrayList<Number>();
xResult = new ArrayList<Number>();
// work on a sorted copy of the input, converted to doubles
final List<Double> lInput = ListOperations.convertToDoubleList(input);
Collections.sort(lInput);
// smallest value equals biggest value: there is only one bar. Use that value as x-axis label and exit, skipping all complicated stuff below
if (lInput.get(0).equals(lInput.get(lInput.size() - 1))) {
produceJustOneBarDTO(baseKey);
return;
}
// A list of integers is treated slightly different: the class width for integers should always be an integer value.
// Only the class of the first element is inspected, and it must be exactly Integer.
final boolean integers = (input.get(0).getClass() == Integer.class);
// determine the class width via the value at the CALIBRATION_PERCENTILE position of the sorted list
final int calibrationPoint = (int) (Math.round(CALIBRATION_PERCENTILE * lInput.size() / 100.0) - 1);
final double calibrationValue = lInput.get(calibrationPoint);
final double classwidth = calcClassWidth(calibrationValue, integers);
// initialize counters and indexes
// maxElementIndexThisBar: index of the first element which has not been assigned to a bar yet
int maxElementIndexThisBar = 0;
// lows: the number of consecutive bars with a too low value
byte lows = 0;
// highest: the highest bar value so far. It starts at 100, so the low-bar threshold never drops below 100 * LOW_BAR_LEVEL
double highest = 100;
// declared outside the loop, because it is needed after the loop to position the rest bar
int barIndex;
for (barIndex = 0; barIndex < MAXBARS; barIndex++) {
// continue iterating at the first element which was not assigned to a previous bar
final ListIterator<Double> it = lInput.listIterator(maxElementIndexThisBar);
// elementIndex is the basic counter when iterating over the input list.
// At the start of the iteration loop, it is set to the maximum index remaining from the previous loop
int elementIndex = maxElementIndexThisBar;
// maxElementIndexLastBar keeps track of how many elements were already placed in previous bars
final int maxElementIndexLastBar = maxElementIndexThisBar;
// the value of the element. Initialized to 0 because the compiler wants initialization; it is reassigned as soon as an element is read
double value = 0;
while (it.hasNext()) {
value = it.next();
// if the first value belonging to the next bar is reached (so a value at or above the upper boundary of this bar), stop iterating
if (value >= ((barIndex + 1) * classwidth)) {
// but before stopping iteration, of course set the maximum element which has been reached
maxElementIndexThisBar = elementIndex;
break;
}
elementIndex++;
}
// The loop above only sets maxElementIndexThisBar if the first element for the next bar is reached.
// If the end of the input list is reached instead, maxElementIndexThisBar is still unchanged here.
// In that case it is only advanced if the last value read lies (within a tolerance of 0.000001) on the lower boundary of the present bar.
// The tolerance presumably mirrors LocalSettings.BIG_DECIMAL_DIVISION_PRECISION (6 decimals) - TODO confirm.
// NOTE(review): it.hasNext() is also false when the loop above stopped (via break) at the very last element of the list.
// NOTE(review): if the list is exhausted and this condition does not hold, the elements of the present bar are not written as a regular
// bar below; they end up in the rest bar which is added after the loop. Confirm that this is intended.
if (!it.hasNext() && (Math.abs(value - barIndex * classwidth) < 0.000001)) {
// if so, count all elements read so far as belonging to this bar. This takes care that the next block will write them
maxElementIndexThisBar = elementIndex;
}
// if no more elements, and no elements were assigned to this bar, don't write a bar
final boolean dontwrite = ((it.hasNext() == false) && (maxElementIndexThisBar - maxElementIndexLastBar == 0));
if (!dontwrite) {
// x value: the (scaled) center of the category; for integers the center is rounded down to a whole number first
if (integers) {
xResult.add(formatX(Math.floor((barIndex + 0.5) * classwidth)));
} else {
xResult.add(formatX((barIndex + 0.5) * classwidth));
}
// y value: the number of elements assigned to this bar
yResult.add(maxElementIndexThisBar - maxElementIndexLastBar);
// keep track of the highest bar; needed for the low-bar detection below
if ((maxElementIndexThisBar - maxElementIndexLastBar) > highest) {
highest = (maxElementIndexThisBar - maxElementIndexLastBar);
}
}
// if the iterator has no more elements, stop the loop. Anything not assigned to a bar yet is handled after the loop
if (!it.hasNext()) {
break;
}
// count the number of consecutive bars with a value below (highest * LOW_BAR_LEVEL); any bar which is not low resets the count.
lows = (byte) (((maxElementIndexThisBar - maxElementIndexLastBar) < (highest * LOW_BAR_LEVEL)) ? (lows + 1) : 0);
// If too many consecutive bars with low values are found at the tail, iteration should stop.
// Conditions for stopping:
// 1) enough consecutive low value bars (>= ALLOWABLE_TAIL_GAP)
// 2) should be in the tail, so beyond the calibrationPoint (so, after having had about CALIBRATION_PERCENTILE percent of the elements)
// 3) the remaining fraction of elements must be smaller than MAX_REST_CAT_FRACTION
if (lows >= ALLOWABLE_TAIL_GAP && maxElementIndexThisBar > calibrationPoint) {
final double remainingFraction = ((double) (input.size() - maxElementIndexThisBar)) / (double) input.size();
if (remainingFraction < MAX_REST_CAT_FRACTION) {
break; // only stop if the rest category contains less than MAX_REST_CAT_FRACTION of all elements
}
}
}
// and the rest bar (if any), containing all elements which were not assigned to a regular bar
final int remaining = input.size() - maxElementIndexThisBar;
if (remaining > 0) { // only add the rest bar if anything is left
// NOTE(review): after a break, (barIndex + 1) * classwidth is the upper boundary of the bar handled last. If the loop ran for all MAXBARS
// bars without a break, barIndex equals MAXBARS here, so the x value lies one class width beyond that boundary.
xResult.add(formatX((barIndex + 1) * classwidth));
yResult.add(remaining);
lastRowHeaderPrefix = integers ? LastRowHeaderPrefix.GREATER_EQUAL : LastRowHeaderPrefix.GREATER;
} else {
lastRowHeaderPrefix = LastRowHeaderPrefix.NONE;
}
// Make a DTO object from the results
produceDTO(baseKey);
}
/**
 * Takes care of too small datasets: in such a case no graph data is produced. The result field is set to the "no data available" result which
 * <code>StatisticalResultDTO</code> supplies for the given key.
 *
 * @param baseKey the baseKey as used by the language resource bundle in order to create language labels.
 */
private void produceTooSmallDatasetDTO(final String baseKey) {
    final StatisticalResultDTO noData = StatisticalResultDTO.noDataAvailable(baseKey);
    result = noData;
}
/**
 * The main entry point (package-private): call this to receive a value object containing the histogram data. The calculation is only done on
 * the first call; later calls return the same result object. The given base key is set on the result object on every call.
 *
 * @param baseKey a String which is the base key for the language resource bundle. All Strings to be built for the rendered graph will be built
 * with this key as the basis.
 * @return a StatisticalResultDTO containing the histogram data.
 */
StatisticalResultDTO getResultObject(final String baseKey) {
    final boolean notCalculatedYet = (result == null);
    if (notCalculatedYet) {
        produceResultArrays(baseKey);
    }
    final StatisticalResultDTO dto = result;
    dto.setBaseKey(baseKey);
    return dto;
}
}