package org.activityinfo.geoadmin;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.activityinfo.geoadmin.model.AdminEntity;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Strings;
import com.google.common.collect.Sets;
/**
* Finds a column within an import source that matches certain criteria. For
* example, if we want to preselect the column containing the entity name, we
* will look for a column containing mostly names [A-Za-z ]+ with few
* duplicates.
*/
public class ColumnGuesser {

    /** Combined criteria a single cell value must satisfy; starts out accepting everything. */
    private Predicate<Object> predicate = Predicates.alwaysTrue();

    /** When true, the uniqueness score of a column is added to its total score. */
    private boolean favorUniqueValues;

    /**
     * Adds the requirement that a value's string form matches the given
     * regular expression in its entirety. {@code null} values never match.
     * The requirement is combined (logical AND) with any criteria added
     * earlier.
     *
     * @param pattern
     *            the regular expression that the whole value must match
     * @return this guesser, to allow chained calls
     */
    public ColumnGuesser forPattern(String pattern) {
        final Pattern regex = Pattern.compile(pattern);
        this.predicate = Predicates.and(predicate, new Predicate<Object>() {
            public boolean apply(Object value) {
                if (value == null) {
                    return false;
                }
                return regex.matcher(value.toString()).matches();
            }
        });
        return this;
    }

    /**
     * Adds the requirement that a value, once passed through
     * {@code PlaceNames.cleanName}, equals the cleaned name of one of the
     * given entities. {@code null} values never match. The requirement is
     * combined (logical AND) with any criteria added earlier.
     *
     * @param entities
     *            the entities whose names are expected to appear in the column
     * @return this guesser, to allow chained calls
     */
    public ColumnGuesser forEntities(List<AdminEntity> entities) {
        final Set<String> expected = Sets.newHashSet();
        for (AdminEntity entity : entities) {
            expected.add(PlaceNames.cleanName(entity.getName()));
        }
        this.predicate = Predicates.and(predicate, new Predicate<Object>() {
            public boolean apply(Object value) {
                if (value == null) {
                    return false;
                }
                return expected.contains(PlaceNames.cleanName(value.toString()));
            }
        });
        return this;
    }

    /**
     * Requests that columns with many distinct values score higher.
     *
     * @return this guesser, to allow chained calls
     */
    public ColumnGuesser favoringUniqueValues() {
        favorUniqueValues = true;
        return this;
    }

    /**
     * Finds the index of the column/attribute in the import source that best
     * matches the given criteria.
     *
     * @param source
     *            the import source whose columns are evaluated
     * @return the index of the highest scoring column, or -1 if no column
     *         scores above zero. On ties the lowest index wins.
     */
    public int findBest(ImportSource source) {
        int bestAttribute = -1;
        double bestScore = 0;
        int attributeCount = source.getAttributeCount();
        for (int attributeIndex = 0; attributeIndex < attributeCount; ++attributeIndex) {
            double score = scoreColumn(source, attributeIndex);
            // strict comparison: a column must beat the current best to replace it
            if (score > bestScore) {
                bestScore = score;
                bestAttribute = attributeIndex;
            }
        }
        return bestAttribute;
    }

    /**
     * Scores a column based on the provided criteria: the predicate score
     * plus, when unique values are favored, the uniqueness score. 0=poor
     * match, high=good match.
     */
    private double scoreColumn(ImportSource source, int attributeIndex) {
        double score = scorePredicate(source, attributeIndex);
        if (favorUniqueValues) {
            score += scoreUnique(source, attributeIndex);
        }
        return score;
    }

    /**
     * Scores a given column/attribute based on the configured criteria.
     *
     * @param source
     *            the import source
     * @param attributeIndex
     *            the index of the attribute in the ImportSource to evaluate
     * @return the proportion of values in the column that satisfy the
     *         criteria (0=poor match, 1=perfect match); 0 if the source has
     *         no features
     */
    private double scorePredicate(ImportSource source, int attributeIndex) {
        List<ImportFeature> features = source.getFeatures();
        int numMatching = 0;
        for (ImportFeature feature : features) {
            if (predicate.apply(feature.getAttributeValue(attributeIndex))) {
                numMatching++;
            }
        }
        return ratio(numMatching, features.size());
    }

    /**
     * Scores the column on the uniqueness of its values: the number of
     * distinct non-null, non-empty string values divided by the feature
     * count of the source (1=all values are unique); 0 if the feature count
     * is zero.
     */
    private double scoreUnique(ImportSource source, int attributeIndex) {
        Set<String> values = Sets.newHashSet();
        for (ImportFeature feature : source.getFeatures()) {
            String value = feature.getAttributeStringValue(attributeIndex);
            if (!Strings.isNullOrEmpty(value)) {
                values.add(value);
            }
        }
        return ratio(values.size(), source.getFeatureCount());
    }

    /**
     * Divides numerator by denominator, returning 0 instead of NaN/Infinity
     * when the denominator is zero (e.g. an import source without features).
     */
    private double ratio(double numerator, double denominator) {
        if (denominator == 0) {
            return 0;
        }
        return numerator / denominator;
    }
}