/*
* Copyright (c) 2017 OBiBa. All rights reserved.
*
* This program and the accompanying materials
* are made available under the terms of the GNU Public License v3.0.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.obiba.magma.math.summary;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import javax.validation.constraints.NotNull;
import org.obiba.magma.Value;
import org.obiba.magma.ValueSource;
import org.obiba.magma.ValueTable;
import org.obiba.magma.Variable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
/**
*
*/
public class TextVariableSummary extends AbstractVariableSummary implements Serializable {
private static final long serialVersionUID = 203198842420473154L;
private static final Logger log = LoggerFactory.getLogger(TextVariableSummary.class);
public static final String NULL_NAME = "N/A";
public static final String NOT_NULL_NAME = "NOT_NULL";
private final org.apache.commons.math3.stat.Frequency frequencyDist = new org.apache.commons.math3.stat.Frequency();
private long n;
private boolean empty = true;
private final List<Frequency> frequencies = new ArrayList<>();
private TextVariableSummary(@NotNull Variable variable) {
super(variable);
}
@Override
public String getCacheKey(ValueTable table) {
return TextVariableSummaryFactory.getCacheKey(variable, table, getOffset(), getLimit());
}
@NotNull
public Iterable<Frequency> getFrequencies() {
return ImmutableList.copyOf(frequencies);
}
public long getN() {
return n;
}
public boolean isEmpty() {
return empty;
}
public static class Frequency implements Serializable {
private static final long serialVersionUID = -2876592652764310324L;
private final String value;
private final long freq;
private final double pct;
private final boolean missing;
public Frequency(String value, long freq, double pct, boolean missing) {
this.value = value;
this.freq = freq;
this.pct = pct;
this.missing = missing;
}
public String getValue() {
return value;
}
public long getFreq() {
return freq;
}
public double getPct() {
return pct;
}
public boolean isMissing() {
return missing;
}
}
@SuppressWarnings("ParameterHidesMemberVariable")
public static class Builder implements VariableSummaryBuilder<TextVariableSummary, Builder> {
private final TextVariableSummary summary;
@NotNull
private final Variable variable;
private boolean addedTable;
private boolean addedValue;
public Builder(@NotNull Variable variable) {
this.variable = variable;
summary = new TextVariableSummary(variable);
}
@Override
public Builder addValue(@NotNull Value value) {
if(addedTable) {
throw new IllegalStateException("Cannot add value for variable " + summary.variable.getName() +
" because values where previously added from the whole table with addTable().");
}
add(value);
addedValue = true;
return this;
}
@Override
public Builder addTable(@NotNull ValueTable table, @NotNull ValueSource valueSource) {
if(addedValue) {
throw new IllegalStateException("Cannot add table for variable " + summary.variable.getName() +
" because values where previously added with addValue().");
}
add(table, valueSource);
addedTable = true;
return this;
}
private void add(@NotNull ValueTable table, @NotNull ValueSource variableValueSource) {
//noinspection ConstantConditions
Preconditions.checkArgument(table != null, "table cannot be null");
//noinspection ConstantConditions
Preconditions.checkArgument(variableValueSource != null, "variableValueSource cannot be null");
if(!variableValueSource.supportVectorSource()) return;
for(Value value : variableValueSource.asVectorSource().getValues(summary.getFilteredVariableEntities(table))) {
add(value);
}
}
private void add(@NotNull Value value) {
//noinspection ConstantConditions
Preconditions.checkArgument(value != null, "value cannot be null");
if(summary.empty) summary.empty = false;
if(value.isSequence()) {
if(value.isNull()) {
summary.frequencyDist.addValue(NULL_NAME);
} else {
for(Value v : value.asSequence().getValue()) {
add(v);
}
}
} else {
summary.frequencyDist.addValue(value.isNull() ? NULL_NAME : value.toString());
}
}
/**
* Returns an iterator of frequencyDist names
*/
private Iterator<String> freqNames(org.apache.commons.math3.stat.Frequency freq) {
return Iterators.transform(freq.valuesIterator(), new Function<Comparable<?>, String>() {
@Override
public String apply(Comparable<?> input) {
return input.toString();
}
});
}
private void compute() {
log.trace("Start compute default summary {}", summary.variable);
Iterator<String> concat = freqNames(summary.frequencyDist);
// Iterate over all category names including or not distinct values.
// The loop will also determine the mode of the distribution (most frequent value)
while(concat.hasNext()) {
String value = concat.next();
summary.frequencies.add(new Frequency(value, summary.frequencyDist.getCount(value),
Double.isNaN(summary.frequencyDist.getPct(value)) ? 0.0 : summary.frequencyDist.getPct(value),
value.equals(NULL_NAME)));
}
Collections.sort(summary.frequencies, new Comparator<Frequency>() {
@Override
public int compare(Frequency o1, Frequency o2) {
return (int) (o2.getFreq() - o1.getFreq());
}
});
summary.n = summary.frequencyDist.getSumFreq();
}
public Builder filter(Integer offset, Integer limit) {
summary.setOffset(offset);
summary.setLimit(limit);
return this;
}
@Override
@NotNull
public TextVariableSummary build() {
compute();
return summary;
}
@NotNull
@Override
public Variable getVariable() {
return variable;
}
}
}