/*
* Copyright 2003-2010 Tufts University Licensed under the
* Educational Community License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License. You may
* obtain a copy of the License at
*
* http://www.osedu.org/licenses/ECL-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS IS"
* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package tufts.vue.ds;
import tufts.Util;
import java.util.*;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.text.DecimalFormat;
import tufts.vue.LWComponent;
import tufts.vue.DEBUG;
import com.google.common.collect.*;
import org.apache.commons.lang.StringEscapeUtils;
/**
* Represents a column in a data-set, or pseudo-column from an XML mapped data-set.
*
* Besides simply recording the name of the column, this class mainly provides
* data-analysis on all the values found in the column, discovering enumerated
* types and doing some data-type analysis. It also includes the ability to
* associate a LWComponent node style with specially marked values.
*
* @version $Revision: 1.25 $ / $Date: 2010-02-03 19:13:16 $ / $Author: mike $
* @author Scott Fraize
*/
public class Field implements tufts.vue.XMLUnmarshalListener
{
private static final org.apache.log4j.Logger Log = org.apache.log4j.Logger.getLogger(Field.class);
/** sentinel for a present-but-empty value; code deliberately compares against this interned constant by identity (==) */
public static final String EMPTY_VALUE = "";
// inferred column data-type tags -- see trackForTypeInference()
public static final String TYPE_TEXT = "TEXT";
public static final String TYPE_INTEGER = "INTEGER";
public static final String TYPE_DECIMAL = "DECIMAL";
public static final String TYPE_DATE = "DATE";
/** pseudo-type for the synthesized quantile fields created in performFinalAnalysis() */
public static final String TYPE_QUANTILE = "QUANTILE";
/** after more than one value, any value longer than this disables enumeration tracking (see trackValue) */
private static final int MAX_ENUM_VALUE_LENGTH = 192;
// NOTE(review): appears unreferenced in this file -- confirm before removing
private static final int MAX_DATE_VALUE_LENGTH = 40;
// NOTE(review): only referenced from commented-out code in isDateValue -- confirm before removing
private static final DateFormat DateParser = DateFormat.getDateTimeInstance();
private Schema schema; // should be final, but not due to castor persistance
private String name;
/** the number of actual (non-empty) values that have been inspected for analysis */
private int mValuesSeen;
/** the string length of the longest value seen */
private int mMaxValueLen;
/** if true, all values found were unique -- there were no repeated values */
private boolean mAllValuesUnique;
/** if true, the values were too long to meaningfully track and enumerate */
private boolean mValueTrackDisabled;
/** map of all possible unique values for enumeration tracking */
private final Multiset<String> mValues = LinkedHashMultiset.create();
private String mType = TYPE_INTEGER; // starts most specific as default, is cleared upon finding anything else
/** true once the type is known for certain (set via setType); type inference stops then */
private boolean mTypeDetermined = false;
private final Collection<String> mDataComments = new ArrayList();
/** map of values currently present in a given context (e.g., a VUE map) */
private Multiset<String> mContextValues;
/** the style node associated with this field, if any (survives flushStats) */
private LWComponent mNodeStyle;
//========================================================================================
// These variables are only relevant to Fields numeric type:
private static final int QUANTILE_BUCKETS = 4; // # of quantile ranges to create (4=quartiles, 5=quintiles, etc)
private double mMinValue = Double.MAX_VALUE;
// NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not the most
// negative value -- as a "no max yet" sentinel it misbehaves for all-negative
// data; flushStats() may use a corrected sentinel -- confirm consistency
private double mMaxValue = Double.MIN_VALUE;
private double mValuesTotal;
private double mMeanValue;
private double mMedianValue;
private double mStandardDeviation;
/** the QUANTILE_BUCKETS-1 boundary values dividing the numeric range; null until computed */
private double[] mQuantiles;
private boolean mAllValuesAreIntegers = true; // defaults true: won't be valid until final analysis
//========================================================================================
/** true while castor XML restore is in progress (between XML_initialized and XML_completed) */
private transient boolean mXMLRestoreUnderway;
/**
 * A persistent reference to a Field for storing associations in maps via castor.
 * Note variable names in this class don't have more than one cap letter to best
 * work with castor auto-mappings. Changing the variable names here will break
 * persistence for previously stored associations under the old names.
 */
public static final class PersistRef {
    // public fields (no getters/setters) for castor auto-mapping -- do NOT rename
    public String
        fieldName,
        schemaName,
        schemaId,
        schemaGuid,
        schemaDsguid;
    @Override public String toString() {
        return String.format("FieldRef[%s.%s %s/%s]", schemaName, fieldName, schemaId, schemaGuid);
    }
    public PersistRef() {} // for castor
    /** capture the identifying coordinates (ids/guids/names) of the field and its schema */
    PersistRef(Field field) {
        final Schema s = field.getSchema();
        schemaId = s.getMapLocalID();
        schemaGuid = s.getGUID();
        schemaDsguid = s.getDSGUID();
        schemaName = s.getName();
        fieldName = field.getName();
    }
}
/** No-arg constructor for castor de-serialization; assigns a placeholder name. */
public Field() {
    name = "<empty>";
}
// association refs collected during castor restore; see XML_initialized/XML_completed
private transient Collection<PersistRef> mRelatedFields;
/** @return the restored association refs (only populated during/after an XML restore) */
Collection<PersistRef> getRelatedFieldRefs() {
    return mRelatedFields;
}
/** Create a named field belonging to the given schema, with all value-tracking stats reset. */
Field(String n, Schema schema) {
    this.name = n;
    setSchema(schema);
    flushStats(true); // init=true suppresses the "flushing" debug log
    if (DEBUG.SCHEMA) {
        Log.debug("instanced " + Util.tags(this));
        //Log.debug("instanced " + Util.tags(this), new Throwable("HERE"));
    }
}
// /** for castor persistance */
// public final String getMapLocalID() {
// return String.format("%s.%s", schema.getMapLocalID(), name);
// }
/** Attach the owning Schema; must be invoked by the parent Schema after de-serialization (needed for persistance). */
void setSchema(Schema s) {
    schema = s;
}
/**
 * For persistance of associations.
 * During XML restore, returns the live collection castor populates; otherwise
 * builds fresh PersistRefs for every field currently associated with this one.
 */
public Collection<PersistRef> getRelatedFields() {
    if (mXMLRestoreUnderway) {
        Log.debug("RETURNING RELATED FIELDS for " + this);
        return mRelatedFields;
    } else {
        // fix: was a raw ArrayList -- parameterize to avoid unchecked warnings
        final Collection<PersistRef> persists = new ArrayList<PersistRef>();
        for (Field f : Association.getPairedFields(this)) {
            persists.add(new PersistRef(f));
        }
        if (DEBUG.SCHEMA && !persists.isEmpty()) {
            Log.debug(this + ": GOT RELATED FIELDS: " + Util.tags(persists));
        }
        return persists;
    }
}
/** interface {@link XMLUnmarshalListener} -- mark restore underway and prepare to collect association refs */
public void XML_initialized(Object context) {
    mXMLRestoreUnderway = true;
    // fix: was a raw HashSet -- parameterize to avoid an unchecked assignment
    mRelatedFields = new HashSet<PersistRef>();
}
/** interface {@link XMLUnmarshalListener} -- mark restore complete; keep or discard the collected refs */
public void XML_completed(Object context) {
    mXMLRestoreUnderway = false;
    if (!mRelatedFields.isEmpty()) {
        if (DEBUG.Enabled) {
            Log.debug("GOT RELATED FIELDS for " + this);
            Util.dump(mRelatedFields);
        }
        // todo: later, process to re-construct associations
    } else {
        // fix: use the type-safe emptyList() instead of the raw Collections.EMPTY_LIST
        mRelatedFields = Collections.emptyList();
    }
}
/** Wrapper for display of special values: e.g., EMPTY_VALUE ("") to "(no value)" */
public static String valueText(Object value) {
    if (value == null)
        return null;
    // identity check is deliberate: EMPTY_VALUE is an interned sentinel constant
    return (value == EMPTY_VALUE) ? "(no value)" : value.toString();
}
/** @return the HTML-escaped display text for the given value; quantile fields get a "name: " prefix */
public String valueDisplay(Object value) {
    final String escaped = StringEscapeUtils.escapeHtml(valueText(value));
    //Log.debug(this + "; valueDisplay: " + value + " -> " + Util.tags(escaped));
    return isQuantile() ? getName() + ": " + escaped : escaped;
}
/** @return the number of times the given value appeared in the data-set (0 if never) */
public int countValues(String value) {
    return mValues.count(value);
}
/**
 * Rescan the given nodes and rebuild mContextValues: the multiset of this
 * field's data-set values currently present on those nodes (e.g., on a VUE map).
 */
void annotateIncludedValues(final Collection<LWComponent> nodes) {
    if (mValues == null || count(mValues) < 1) {
        // nothing tracked for this field: just clear any stale context counts
        if (mContextValues != null)
            mContextValues.clear();
        return;
    }
    if (mContextValues == null)
        mContextValues = HashMultiset.create();
    else
        mContextValues.clear();
    if (DEBUG.META) Log.debug("MARKING INCLUDED VALUES AGAINST " + nodes.size() + " NODES for " + this);
    final Set<String> valuesToCheck = mValues.elementSet();
    // O(nodes * uniqueValues) scan: count every node carrying each known value
    for (LWComponent c : nodes) {
        for (String value : valuesToCheck) {
            //if (c.getDataSchema() == schema && c.hasDataValue(this.name, value)) {
            if (c.hasDataValue(this.name, value)) {
                //if (!c.isDataValueNode()) // SMF - changed to allow data-value nodes 2009-10-04
                mContextValues.add(value);
                //Log.debug(String.format("found in context: %s=[%s], count=%d", this.name, value, mContextValues.count(value)));
            }
        }
        // final Iterator<String> i = valuesToCheck.iterator();
        // while (i.hasNext()) {
        //     final String value = i.next();
        //     if (c.isSchematicFieldNode() && c.hasDataValue(this.name, value)) {
        //         //Log.debug(String.format("found in context: %s=[%s]", this.name, value));
        //         mContextValues.add(value);
        //         i.remove();
        //     }
        // }
        // if (valuesToCheck.size() < 1) {
        //     //Log.debug(this + "; no more values to check, found: " + mContextValues);
        //     Log.debug(String.format("all %d data-set values found on the map, done marking early for [%s]",
        //                             mValues.size(),
        //                             this.name));
        //     if (mContextValues.size() != mValues.size())
        //         Log.error(new Throwable(String.format("context values %d != data-set values size %d in [%s]",
        //                                               mContextValues.size(),
        //                                               mValues.size(),
        //                                               this.name)));
        //     // Log.debug(String.format("all values discovered, found %3d on-map out of %3d in data-set [%s]",
        //     //                         mContextValues.size(),
        //     //                         mValues.size(),
        //     //                         this.name));
        //     break;
        // }
    }
}
/** @return true if the given value is present in the current context (see annotateIncludedValues) */
public boolean hasContextValue(String value) {
    return mContextValues != null && mContextValues.contains(value);
}
/** @return how many times the given value appears in the current context (0 if never annotated) */
public int countContextValue(String value) {
    return mContextValues == null ? 0 : mContextValues.count(value);
}
/** @return the total number of context value instances, including repeats (0 if never annotated) */
public int getContextValueCount() {
    return mContextValues == null ? 0 : mContextValues.size();
}
/** Reset all tracked value statistics (the node style is kept); logs the flush. */
protected void flushStats() {
    flushStats(false);
}
/**
 * Reset all value-tracking state to initial defaults.
 * @param init true when called from the constructor (suppresses the debug log)
 */
private void flushStats(boolean init) {
    if (!init) Log.debug("flushing " + this);
    // reset to initial defaults
    mValues.clear();
    mValuesSeen = 0;
    mValueTrackDisabled = false;
    mAllValuesUnique = true;
    mAllValuesAreIntegers = true;
    mMaxValueLen = 0;
    mType = TYPE_INTEGER;
    mTypeDetermined = false;
    mDataComments.clear();
    mMinValue = Double.MAX_VALUE;
    // fix: was Double.MIN_VALUE, which is the smallest POSITIVE double, not the
    // most negative -- that sentinel made the max never update for all-negative
    // data sets. -Double.MAX_VALUE is the correct "no max seen yet" sentinel.
    mMaxValue = -Double.MAX_VALUE;
    mValuesTotal = 0;
    mMeanValue = 0;
    mMedianValue = 0;
    mStandardDeviation = 0;
    mQuantiles = null;
    // we keep the nodeStyle, which is the whole reason we use a flush instead of
    // just creating new Schema+Field objects when reloading. Tho at this point,
    // may be easier to re-create all & just carry over the styles.
}
/** for persistance */
public void setStyleNode(LWComponent style) {
    if (DEBUG.SCHEMA) Log.debug(String.format("setStyleNode %-22s%s", this, style));
    // if (mNodeStyle != null)
    // Log.warn("resetting field style " + this + " to " + style, new Throwable("HERE"));
    mNodeStyle = style;
}
/** @return true if a style node has been associated with this field */
public boolean hasStyleNode() {
    return mNodeStyle != null;
}
/** @return the style node for this field, or null if none has been set */
public LWComponent getStyleNode() {
    return mNodeStyle;
}
/** @return the column name of this field */
public String getName() {
    return name;
}
/** for castor persistance only */
public void setName(String s) {
    name = s;
}
/** @return the Schema (data-set) this field belongs to */
public Schema getSchema() {
    return schema;
}
/** @return one of the TYPE_* constants */
public String getType() {
    return mType;
}
// identity (==) comparisons are safe here: mType only ever holds the interned TYPE_* constants
public boolean isNumeric() {
    return getType() == TYPE_DECIMAL || getType() == TYPE_INTEGER;
}
// cause recorded when a type is set explicitly rather than inferred from a value
private static final String NoCause = "(explicit-type-set)";
/** Record a (possibly still tentative) type, logging the value that triggered it. */
private void takeType(String type, String cause) {
    if (DEBUG.Enabled) Log.debug(toTerm() + " type=>" + type + " on " + Util.tags(cause));
    mType = type;
}
private void setType(String type) {
    setType(type, NoCause);
}
/** Set the type definitively: once mTypeDetermined is true, inference stops. */
private void setType(String type, String cause) {
    takeType(type, cause);
    mTypeDetermined = true;
}
/** @return true if this is a synthesized quantile pseudo-field (see performFinalAnalysis) */
public boolean isQuantile() {
    return mType == TYPE_QUANTILE;
}
/** @return "schemaName.fieldName", with "&lt;?&gt;" standing in for a missing schema */
@Override
public String toString() {
    final String schemaName = (schema == null) ? "<?>" : schema.getName();
    return String.format("%s.%s", schemaName, getName());
}
/** @return this field's key quoted for use in relation expressions (see Relation.quoteKey) */
public String toTerm() {
    return Relation.quoteKey(this);
}
// @Override
// public String toString() {
// //if (isNumeric) type=TYPE_DECIMAL; // HACK: NEED ANALYSIS PHASE
// //return getName();
// final String numeric = isNumeric ? "/NUMERIC" : "";
// //final String name = schema.getName() + "." + getName();
// final String name = getName();
// if (mValuesSeen() == 1)
// //return String.format("<html><code>%s</code>:<br>\"%s\"", getName(), getValues().toArray()[0]);
// return String.format("%-14s=\"%s\"", name, getValues().toArray()[0]);
// else if (mAllValuesUnique)
// return String.format("%-14s (%d)/%s%s", name, mValuesSeen(), type, numeric);
// else
// return String.format("%-14s [%d]/%s%s", name, uniqueValueCount(), type, numeric);
// }
/**
 * @return true if this field could serve as a unique key for the schema: tracking was
 * never disabled, every value is unique, every row has a value, and it isn't a date.
 */
public boolean isPossibleKeyField() {
    //return mAllValuesUnique && mValuesSeen == schema.getRowCount() && !(type == TYPE_DATE);
    return !mValueTrackDisabled
        && mAllValuesUnique
        && uniqueValueCount() == valueCount()
        && valueCount() == schema.getRowCount()
        && !(mType == TYPE_DATE);
}
/** @return true if this is the schema's unique key field */
public boolean isKeyField() {
    return schema.getKeyField() == this;
    // boolean t = (schema.getKeyField() == this);
    // Log.debug(String.format("isKeyField=%s %s", t ? "YES" : "no", Util.tags(this)));
    // return t;
}
/** @return true if value tracking was disabled (values exceeded MAX_ENUM_VALUE_LENGTH) */
public boolean isUntrackedValue() {
    return mValueTrackDisabled;
}
/** @return true if all the values for this Field have been fully tracked and recorded, and more than one
 * unique value was found */
public boolean isEnumerated() {
    return !mValueTrackDisabled && uniqueValueCount() > 1;
}
/** @return true if this field appeared a single time in the entire data set.
 * This can generally only be true for fields from an XML data-set, in which a single-value
 * "column" is in effect created by an XML key that only appears once, such as keys
 * that apply to the entire feed.
 */
public boolean isSingleton() {
    return mAllValuesUnique && (mValues != null && count(mValues) < 2);
}
/** @return true if every value found for this field has the same value.
 * Will always be true if isSingleton() is true
 */
public boolean isSingleValue() {
    return uniqueValueCount() == 1;
}
/** @return the instance value count: the number of times any value appeared for this field (includes repeats) */
protected int valueCount() {
    return mValuesSeen;
}
/** @return the unique value count if this field is enumerated, otherwise -1 */
public int getEnumValuesSeen() {
    return isEnumerated() ? uniqueValueCount() : -1;
}
/** @return the number of distinct values seen; falls back to valueCount() when no value set exists */
protected int uniqueValueCount() {
    // note: mValues is a final field initialized inline, so the null branch appears
    // defensive-only in this version of the code
    if (mValues == null) {
        if (mMaxValueLen == 0)
            return 0;
        else
            return valueCount();
    } else {
        return count(mValues);
    }
    //return mValues == null ? valueCount() : mValues.entrySet().size();
}
/** @return the count of all unique values in the Multiset (0 for null) */
private static int count(Multiset<?> m) {
    // fix: parameter was a raw Multiset -- a wildcard keeps callers unchanged
    // while avoiding raw-type warnings.
    // To fulfill the java.util.Collection contract, Multiset.size() returns the *virtual*
    // count of items in the set, not the unique items as a counting HashMap impl would do --
    // we have to actually pull the entrySet/elementSet and count that to get the count of
    // unique values. Fortunately, the impl appears to cache the entrySet, so it's not creating
    // a new one each time. (The elementSet is also cached, tho in the current google impl, the
    // entrySet has to do a tad less delegation to extract the backingMap size)
    return m == null ? 0 : m.entrySet().size();
    //return m == null ? 0 : m.elementSet().size();
}
/** @return the string length of the longest value seen (tracked even when enumeration is disabled) */
public int getMaxValueLength() {
    return mMaxValueLen;
}
/**
 * @return the set of all unique values this Field has been seen to take amongst all rows in
 * the data-set. Note that the returned set is modifiable, and should NOT be modified.
 */
public Set<String> getValues() {
    return mValues.elementSet();
    // note: the set from elementSet() can modify the backing Multiset
    //return mValues == null ? Collections.EMPTY_SET : mValues.elementSet();
}
// shared immutable empty multiset for the (defensive) null-mValues case
private static final Multiset EMPTY_MULTISET = Multisets.unmodifiableMultiset(HashMultiset.create(0));
/** @return an unmodifiable view of all values together with their occurrence counts */
public Multiset<String> getValueSet() {
    return mValues == null ? EMPTY_MULTISET : Multisets.unmodifiableMultiset(mValues);
    //return mValues == null ? EMPTY_MULTISET : mValues;
}
// public Map<String,Integer> getValueMap() {
// return mValues == null ? Collections.EMPTY_MAP : mValues;
// }
// todo: may want to move this to a separate analysis code set
/**
 * Record one raw value instance for this column: updates the max-length stat,
 * the value multiset, uniqueness tracking, and -- until the type has been
 * determined -- feeds type inference.  Null values are ignored entirely.
 */
void trackValue(String value) {
    if (value == null)
        return;
    final int valueLen = value.length();
    if (valueLen > mMaxValueLen)
        mMaxValueLen = valueLen;
    if (mValueTrackDisabled)
        return;
    // identity check against the interned EMPTY_VALUE sentinel is deliberate
    if (value == EMPTY_VALUE) {
        ; // don't increment value count
    } else if (valueLen == 0) {
        value = EMPTY_VALUE; // don't increment value count
    } else {
        mValuesSeen++;
    }
    // a single long value is tolerated (e.g., an XML singleton); a second one disables tracking
    if (mValuesSeen > 1 && value.length() > MAX_ENUM_VALUE_LENGTH) {
        mValueTrackDisabled = true;
        setType(TYPE_TEXT, value);
        return;
    }
    if (mValues.contains(value))
        mAllValuesUnique = false;
    mValues.add(value);
    //Log.debug(this + " added " + value + "; size=" + count(mValues));
    if (value == EMPTY_VALUE)
        return;
    if (!mTypeDetermined)
        trackForTypeInference(value);
}
// the inferencing depends on not passing this method null or empty values
/**
 * Inspect one value to refine the guessed column type, and accumulate the
 * numeric min/max/total statistics used by the final analysis.
 */
private void trackForTypeInference(final String text)
{
    if (text.indexOf(':') > 0) {
        if (isDateValue(text)) {
            // THIS IS A MAJOR GUESS: we guess it's a date field if we see a single valid date
            setType(TYPE_DATE, text);
        } else {
            // having seen a ':' but not being a date, we infer that this is text (e.g., not numeric)
            setType(TYPE_TEXT, text);
        }
    } else {
        final double number = getNumericValue(text, true);
        if (Double.isNaN(number)) {
            // the first non-numeric we see, mark us as text
            setType(TYPE_TEXT, text);
        } else {
            //Log.debug(Util.tags(text) + " = " + number);
            // fix: min/max were tracked via if/else-if, so the max was only
            // checked when the value did NOT lower the min -- e.g., a strictly
            // descending sequence of values never updated mMaxValue at all.
            // The two bounds must be tracked independently.
            if (number < mMinValue)
                mMinValue = number;
            if (number > mMaxValue)
                mMaxValue = number;
            mValuesTotal += number;
            if (mAllValuesAreIntegers && number != (long) number) {
                mAllValuesAreIntegers = false;
                takeType(TYPE_DECIMAL, text); // do NOT use setType -- this is still a guess, value is not determined yet
            }
        }
    }
}
/** Convenience overload: parse with currency handling enabled. */
private double getNumericValue(final String text) {
    return getNumericValue(text, true);
}
// DecimalFormat's are not synchronized, thus these cannot be static.
private final NumberFormat LocalNumberFormat = NumberFormat.getInstance();
//private final NumberFormat LocalCurrencyFormat = NumberFormat.getCurrencyInstance();
/** @return double value if one found, Double.NaN otherwise */
private double getNumericValue(final String text, final boolean tryCurrency) {
    try {
        // Double.parseDouble handles most stuff, including "0x2F" style
        // hex values as well as scientific notation.
        return Double.parseDouble(text);
    } catch (Throwable t) {}
    Number value = null;
    try {
        // This handles values of the form "1,234,567". It will also extract any
        // number that can be found at the head of a string: e.g. "7foo" will return
        // 7, or "70%" will return 70 (*not* 0.70). The instance of LocalNumberFormat will
        // generally be a DecimalFormat
        value = LocalNumberFormat.parse(text);
    } catch (Throwable t) {}
    // Note that if we use a NumberFormat.getCurrencyInstance() here to handle
    // currency, it will only allow the local currency symbol.
    if (value == null && tryCurrency && text.length() > 1 && isCurrencySymbol(text.codePointAt(0))) {
        // strip the leading currency symbol and retry (tryCurrency=false prevents looping)
        value = getNumericValue(text.substring(1), false); // NOTE RECURSION
        //Log.debug("HANDLED CURRENCY " + Util.tags(text) + " = " + Util.tags(value));
    }
    // could allow for percent parsers that return value/100
    if (DEBUG.SCHEMA || DEBUG.DATA) Log.debug(Util.tags(text) + " = " + Util.tags(value));
    return value == null ? Double.NaN : value.doubleValue();
}
/** @return true if the given code point is a currency symbol (e.g., '$') */
private static boolean isCurrencySymbol(int c)
{
    // the explicit '$' test should be redundant with the Unicode category check
    if (c == '$')
        return true;
    return Character.getType(c) == Character.CURRENCY_SYMBOL;
}
/**
 * @return true if the value parses as a date.
 * NOTE(review): relies on the deprecated Date(String) constructor (which throws
 * on unparseable input); the DateParser-based alternative below was abandoned --
 * confirm parsing behavior before modernizing.
 */
private static boolean isDateValue(String value) {
    Date date = null;
    try {
        date = new Date(value);
        if (DEBUG.Enabled) Log.debug("PARSED DATE: " + Util.tags(date) + " from " + value);
    } catch (Throwable t) {
        if (DEBUG.DATA) Log.debug("Failed to parse [" + value + "] as date: " + t);
    }
    // try {
    //     date = DateParser.parse(value);
    // } catch (java.text.ParseException e) {
    //     eoutln("Failed to parse [" + value + "] as date: " + e);
    //     return false;
    // }
    return date != null;
}
// private static boolean isNumericValue(String value) {
// try {
// Double.parseDouble(value);
// } catch (Throwable t) {
// //if (DEBUG.SCHEMA) Log.info(t);
// return false;
// }
// return true;
// }
/** compute quantiles via median values and return the absolute median */
private static double computeQuantiles(final double[] quantiles, final double[] values) {
    // Note: The quantile ranges will change depending on how the boundaries are handled
    // (e.g., off-by-one differences in computing which index to use). There does not
    // appear to be a commonly agreed upon method of resolving this in either direction.
    Arrays.sort(values); // note: sorts the caller's array in place
    // an even number of regions means an odd quantile count, whose middle entry must equal the median
    final boolean EVEN_REGIONS = (quantiles.length % 2 != 0);
    if (DEBUG.Enabled) Log.debug("count of all possible values: " + values.length);
    //for (int i = 0; i < values.length; i++) Log.debug("v" + i + ": " + values[i]);
    final int regions = quantiles.length + 1;
    final float range = (float) values.length / (float) regions;
    if (DEBUG.Enabled) Log.debug("each of " + regions + " quantile regions has an approx sample size of: " + range + " samples");
    // TODO: the below median computation for ranges with an even # of buckets should
    // be done for each range
    for (int i = 0; i < quantiles.length; i++) {
        final float rawIndex = (i+1) * range;
        //final int index = Math.round(rawIndex);
        final int index = (int) Math.floor(rawIndex); // using floor will exactly align middle index in odd numbered value sets
        quantiles[i] = values[index];
        if (DEBUG.Enabled) Log.debug(String.format("quantile %d index %3.2f (%d) value = " + values[index], i, rawIndex, index));
    }
    // If the number of buckets is even (and thus the # of quantile values needed is odd),
    // the middle quantile will be the median.
    final double median;
    final int halfIndex = values.length / 2;
    if (values.length % 2 == 0) {
        // even # of sample values -- absolute median must be computed separately by averaging middle two values
        final double belowMedian = values[halfIndex - 1];
        final double aboveMedian = values[halfIndex];
        median = (belowMedian + aboveMedian) / 2.0;
        if (DEBUG.Enabled) Log.debug(String.format("AVERAGED MEDIAN: %g from %g+%g halfIndex=%d",
                                                   median, belowMedian, aboveMedian, halfIndex));
    } else {
        // odd # of sample values -- median already represented by the middle value
        median = values[halfIndex];
        if (DEBUG.Enabled) Log.debug(String.format("PICKED MEDIAN: %g from exact middle index=%d",
                                                   median, halfIndex));
        //median = quantiles[(quantiles.length + 1) / 2 - 1];
    }
    if (EVEN_REGIONS) {
        // force the middle quantile to be exactly the absolute median
        if (quantiles[quantiles.length / 2] != median) {
            if (DEBUG.Enabled) Log.info(String.format("PATCHING MIDDLE QUANTILE TO ABSOLUTE MEDIAN; %g -> %g",
                                                      quantiles[quantiles.length / 2], median));
            quantiles[quantiles.length / 2] = median;
        }
    }
    return median;
}
/**
 * Fill the quantiles array with boundary values that divide [minValue,maxValue]
 * into equal-width value regions (no sample distribution is consulted).
 */
private static void computeValueRangeQuantiles(final double[] quantiles, final double minValue, final double maxValue)
{
    final double allValueRange = (maxValue - minValue);
    final double quantileValueRange = allValueRange / (quantiles.length+1);
    if (DEBUG.Enabled) Log.debug(String.format("computing value-based quantiles for values (%g-%g) range=%g, quantileRange=%g",
                                               minValue, maxValue, allValueRange, quantileValueRange));
    int slot = 0;
    while (slot < quantiles.length) {
        quantiles[slot] = minValue + (quantileValueRange * (slot+1));
        if (DEBUG.Enabled) Log.debug(String.format("quantile %d value = %g", slot, quantiles[slot]));
        slot++;
    }
}
// exactly one strategy is selected in computeQuantiles(double[]) below
private static final boolean USE_VALUE_RANGE_QUANTILES = true; // original Anoop method
private static final boolean USE_STANDARD_QUANTILES = false; // standard statistical method (resource intensive: duplicates & sorts entire sample set)
//private static final boolean USE_COMPRESSED_SAMPLE_QUANTILES = !USE_STANDARD_QUANTILES; // ignore repeated values in sample set
/** compute and record standard method quantile values as well as the median value */
private void computeQuantiles(final double[] allValues)
{
    // NOTE: for data-sets with many repeated values, several of the quantiles may
    // cover exactly the same range of values. Adding another type of analysis for
    // that case would be useful, or perhaps rolling our own "modified quantile"
    // analysis that forces quantiles to cover different values.
    // E.g: if QUANTILE_BUCKETS=4 (we want 4 buckets), we need to produce 3 (three) quantile
    // values to divide the range into 4 (four) regions
    mQuantiles = new double[QUANTILE_BUCKETS - 1];
    if (USE_STANDARD_QUANTILES) {
        // this will fill mQuantiles with appropriate values
        mMedianValue = computeQuantiles(mQuantiles, allValues);
    } else if (USE_VALUE_RANGE_QUANTILES) {
        // This method can produce more semantically meaningful quantiles, but this
        // backfires and renders the quantiles mostly useless if there are outliers.
        // E.g., a single high outlier can leave almost all values in the first
        // bucket, nothing at all in the middle buckets, and the single high-flyer
        // in the top bucket.
        computeValueRangeQuantiles(mQuantiles, mMinValue, mMaxValue);
        mMedianValue = Double.NaN; // uncomputed
    } else {
        // This is the STANDARD method except with "compressed" samples -- only
        // unique values are analyized.
        int validCount = mValues.elementSet().size();
        if (mValues.contains(EMPTY_VALUE))
            validCount--;
        final double[] uniqueValues = new double[validCount];
        int i = 0;
        for (String s : mValues.elementSet())
            if (s != EMPTY_VALUE) // identity check against the interned sentinel
                uniqueValues[i++] = getNumericValue(s, true);
        mMedianValue = computeQuantiles(mQuantiles, uniqueValues);
    }
}
private static final boolean SKEW_QUANTILES_LOW = false; // anecdotally "more balanced" when skewing high
private static final boolean SKEW_QUANTILES_HIGH = !SKEW_QUANTILES_LOW;
/** @return the quantile the given value is determined to lie in. Will return values from 0 - (QUANTILE_BUCKETS-1) */
private int getQuantile(final double value) {
    // note:
    // using "value <= mQuantiles[i]" skews data to lower quantiles
    // using "value < mQuantiles[i]" skews data to higher quantiles
    if (SKEW_QUANTILES_LOW) {
        for (int i = 0; i < mQuantiles.length; i++)
            if (value <= mQuantiles[i])
                return i;
    } else {
        for (int i = 0; i < mQuantiles.length; i++)
            if (value < mQuantiles[i])
                return i;
    }
    // at or beyond the top boundary: the last bucket
    return mQuantiles.length;
}
/** @return a display name for bucket i, e.g. "Q1: 0-24", built from the bucket's value range */
private String getQuantileName(int i) {
    // A 1.0 TOP_RANGE_ADJUSTMENT value on only works for integer ranges; won't work for
    // sub-integer value ranges. This adjustment entirely depends on which way we skew in
    // getQuantile.
    // For non-integer values we just allow the quantile names to be ambiguously overlapping
    // (e.g., allow the MAX of one range to equal the MIN of the next range).
    final double TOP_RANGE_ADJUSTMENT;
    if (mAllValuesAreIntegers && SKEW_QUANTILES_HIGH) // could still adjust low, but would need different adjustment
        TOP_RANGE_ADJUSTMENT = 1.0;
    else
        TOP_RANGE_ADJUSTMENT = 0.0;
    final double min, max;
    if (i == 0)
        min = mMinValue;
    else
        min = mQuantiles[i - 1];
    if (i == mQuantiles.length)
        max = mMaxValue;
    else
        max = mQuantiles[i] - TOP_RANGE_ADJUSTMENT;
    if (mAllValuesAreIntegers)
        return String.format("Q%d: %,.0f-%,.0f", i+1, min, max);
    else
        return String.format("Q%d: %,g-%,g", i+1, min, max);
}
//private static final String[] QUANTILE_NAMES = { "Lowest", "Low", "Medium", "High", "Highest" };
/**
 * Run after all values have been tracked: computes mean/median/std-dev for
 * numeric non-key fields with enough distinct values, then synthesizes a new
 * TYPE_QUANTILE pseudo-field in the schema and assigns every row a quantile
 * bucket value.  No-op for non-numeric, low-cardinality, or key fields.
 */
void performFinalAnalysis() {
    mTypeDetermined = true;
    // require a meaningful spread of values before quantiling is worthwhile
    if (!isNumeric() || uniqueValueCount() <= (QUANTILE_BUCKETS*3))
        return;
    if (isKeyField())
        return;
    //-----------------------------------------------------------------------------
    // Compute common summary statistics & quantiles
    //-----------------------------------------------------------------------------
    mMeanValue = mValuesTotal / mValuesSeen;
    // TODO: we could compute the quantile values in much less memory by using a
    // sorted-by-value version of the existing mValues Multiset, and iterating through it by
    // increasing "count" to find the appropriate median values.
    // performance: if all values are integers/longs, we could optimize the following codepaths to
    // use integer types & parsing code
    final double[] allValues;
    if (USE_STANDARD_QUANTILES)
        allValues = new double[mValuesSeen];
    else
        allValues = null;
    double totalSquaredDeviations = 0;
    int count = 0;
    // single pass over the rows: collect samples (if needed) and squared deviations
    for (DataRow row : schema.getRows()) {
        final String text = row.getValue(this);
        if (text == null) {
            // this should only happen in XML data-sets with fields that don't have
            // values in all rows
            continue;
        }
        final double value = getNumericValue(text);
        if (Double.isNaN(value))
            continue;
        if (USE_STANDARD_QUANTILES)
            allValues[count] = value;
        count++;
        final double meanDeviation = value - mMeanValue;
        totalSquaredDeviations += (meanDeviation * meanDeviation);
    }
    // sanity check: the row scan must agree with the tracked value count
    if (count != mValuesSeen) {
        Log.warn(this + Util.TERM_RED + ": COUNT != mValuesSeen; " + count + " != " + mValuesSeen + Util.TERM_CLEAR);
        return;
    }
    // population variance (divides by N, not N-1)
    final double variance = totalSquaredDeviations / mValuesSeen;
    mStandardDeviation = Math.sqrt(variance);
    //-----------------------------------------------------------------------------
    // Create quantiles
    //-----------------------------------------------------------------------------
    computeQuantiles(allValues);
    //-----------------------------------------------------------------------------
    // Explicitly create quantile value records (we do this first only so they are ordered)
    //-----------------------------------------------------------------------------
    final double range = mMaxValue - mMinValue;
    //final String[] quantileNames = QUANTILE_NAMES.clone();
    final String[] quantileNames = new String[QUANTILE_BUCKETS];
    //final Field quantileField = this;
    final Field quantileField =
        schema.addFieldBefore(this,
                              String.format("%s [Q%d]", getName(), QUANTILE_BUCKETS));
    quantileField.setType(TYPE_QUANTILE);
    //quantileField.setStyleNode(getStyleNode()); // TODO: WON'T WORK: style-node not yet set
    // duplicate v.s. crate new via data-action so we don't use up color schemes
    // NOTE(review): will NPE if getStyleNode() is null here -- presumably the style is
    // always set before final analysis runs; confirm against the caller
    quantileField.setStyleNode(DataAction.initNewStyleNode(getStyleNode().duplicate()));
    //Util.printStackTrace("SETTING LABEL ON " + Util.tags(quantileField.getStyleNode() + " for " + this));
    quantileField.getStyleNode().setLabelTemplate(String.format("%s Range\n${%s}", getName(), quantileField.getName()));
    for (int i = 0; i < QUANTILE_BUCKETS; i++) {
        quantileNames[i] = getQuantileName(i);
        // We add the possible values now only to enforce the order in mValues for the DataTree
        quantileField.mValues.add(quantileNames[i]);
        //quantileField.trackValue(quantileNames[i]);
    }
    //-----------------------------------------------------------------------------
    // Assign quantile values to all rows:
    //-----------------------------------------------------------------------------
    for (DataRow row : schema.getRows()) {
        final String text = row.getValue(this);
        if (text == null) {
            // this should only happen in XML data-sets with fields that don't have
            // values in all rows
            continue;
        }
        final double value = getNumericValue(text, true);
        final String quantileValue;
        if (Double.isNaN(value)) {
            quantileValue = Field.EMPTY_VALUE;
        } else {
            quantileValue = quantileNames[getQuantile(value)];
            row.addValue(quantileField, quantileValue);
        }
        // Don't bother to add quartile values for empty values
        //row.addValue(quantileField, quantileValue);
    }
    //-----------------------------------------------------------------------------
    if (DEBUG.Enabled) {
        //final double deviationQ = range / QUANTILE_BUCKETS;
        //quantileField.trackValue(String.format("(DeviationQ: %.1f)", deviationQ));
        // quantileField.mValues.add(String.format("(Std Dev: %.1f)", mStandardDeviation));
        // quantileField.mValues.add(String.format("(Segments: %.1f)", range / mStandardDeviation));
    }
    final double deviationsToCoverAllValues = range / mStandardDeviation; // # of std-dev's needed to cover all values
    // attach human-readable summary stats to the quantile field for display
    if (mAllValuesAreIntegers) {
        quantileField.addDataComment(String.format("Mean: %.1f", mMeanValue));
        if (!Double.isNaN(mMedianValue))
            quantileField.addDataComment(String.format("Median: %.1f", mMedianValue));
        quantileField.addDataComment(String.format("Std Dev: %d x %.1f",
                                                   (int) Math.round(mStandardDeviation),
                                                   deviationsToCoverAllValues
                                                   //(int) Math.round(deviationsToCoverAllValues)
                                                   ));
    } else {
        quantileField.addDataComment(String.format("Mean: %g", mMeanValue));
        if (!Double.isNaN(mMedianValue))
            quantileField.addDataComment(String.format("Median: %g", mMedianValue));
        quantileField.addDataComment(String.format("Std Dev: %g x %.1f",
                                                   mStandardDeviation,
                                                   deviationsToCoverAllValues
                                                   ));
    }
}
/** @return human-readable summary-statistic strings (mean/median/std-dev) for display */
public Collection<String> getDataComments() {
    return mDataComments;
}
/** Record one summary-statistic display string. */
private void addDataComment(String s) {
    mDataComments.add(s);
}
// This code appears to be calculating a quantile by calculating the linear % location the value
// has within the total range of possible values. We're now computing quantiles using a standard definition of
// quantile / quartile that involves computing by median.
// private int getQuantile(final double value) {
// return getQuantile(mMinValue, mMaxValue, value, QUANTILE_BUCKETS);
// }
// private static int getQuantile
// (final double min,
// final double max,
// final double value,
// final int N)
// {
// final double ratio = (value-min) / (max-min);
// final int quantile = (int) Math.ceil(ratio*N);
// if (quantile <= 0) {
// Log.warn("quantile="+quantile + " for value " + value);
// return 1;
// } else
// return quantile;
// }
// private static String getQuantileRange
// (final double min,
// final double max,
// final int quantile,
// final int N)
// {
// final double lowVal = min + (max-min)*(quantile-1)/N;
// final double highVal = min + (max-min)*(quantile)/N;
// return String.format("%.1f-%.1f", lowVal, highVal);
// }
/**
 * @return a short human-readable sampling of this field's values: the full
 * value set when small, otherwise just the first three unique values.
 */
private String sampleValues(boolean unique) {
    if (count(mValues) <= 20)
        return unique ? mValues.elementSet().toString() : mValues.toString();
    // large value set: show the first few unique values only
    final StringBuilder sample = new StringBuilder("[examples: ");
    int shown = 0;
    for (String v : mValues.elementSet()) {
        sample.append('"').append(v).append('"');
        if (++shown >= 3)
            break;
        sample.append(", ");
    }
    return sample.append("]").toString();
}
/** @return a one-line debug summary of the tracked value statistics */
public String valuesDebug() {
    // note: mValues is final/non-null in this version; the null branch is defensive
    if (mValues == null) {
        if (mValuesSeen == 0)
            return "(empty)";
        else
            return String.format("%5d values (un-tracked; max-len%6d)", mValuesSeen, mMaxValueLen);
    }
    else if (isSingleton()) {
        return "singleton" + mValues.elementSet();
    }
    else if (mAllValuesUnique) {
        if (count(mValues) > 1) {
            return String.format("%5d unique, single-instance values; %s", count(mValues), sampleValues(true));
            // String s = String.format("%2d unique, single-instance values", values.size());
            // if (values.size() < 16)
            //     //return s + "; " + values.keySet();
            //     return s + "; " + values.toString();
            // else
            //     return s + "; " + sampleValues();
        }
        else
            return "<empty>?";
    }
    else
        return String.format("%5d values, %4d unique: %s", valueCount(), count(mValues), sampleValues(false));
    //return String.format("%5d unique values in %5d; %s", values.size(), valueCount(), sampleValues(false));
}
/** interface {@link XMLUnmarshalListener} -- does nothing here */
public void XML_fieldAdded(Object context, String name, Object child) {}
/** interface {@link XMLUnmarshalListener} -- does nothing here */
public void XML_addNotify(Object context, String name, Object parent) {}
}
// abstract class AbstractValue implements CharSequence {
// final String value;
// AbstractValue(String s) { value = s; }
// public int length() { return value.length(); }
// public char charAt(int index) { return value.charAt(index); }
// public CharSequence subSequence(int start, int end) { return value.subSequence(start, end); }
// public int compareTo(String anotherString) { return value.compareTo(anotherString); }
// }
// final class QValue extends AbstractValue {
// public final int quantile;
// QValue(String s, int qv) { super(s); quantile = qv; }
// }