ColumnGuesser.java example

Explorer
activityinfo-master
package org.activityinfo.geoadmin;

import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

import org.activityinfo.geoadmin.model.AdminEntity;

import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.base.Strings;
import com.google.common.collect.Sets;

/**
 * Finds a column within an import source that matches certain criteria. For
 * example, if we want to preselect the column containing the entity name, we
 * will look for a column containing mostly names [A-Za-z ]+ with few
 * duplicates.
 */
public class ColumnGuesser {
    private Predicate<Object> predicate = Predicates.alwaysTrue();
    private boolean favorUniqueValues;

    /**
     * Creates a guesser for a
     * 
     * @param pattern
     * @return
     */
    public ColumnGuesser forPattern(String pattern) {
    	final Pattern regex = Pattern.compile(pattern);
        this.predicate = Predicates.and(predicate, new Predicate<Object>() {
        	public boolean apply(Object value) {
        		if(value == null) {
        			return false;
        		} else {
        			return regex.matcher(value.toString()).matches();
        		}
        	}
		});
        return this;
    }
    

	public ColumnGuesser forEntities(List<AdminEntity> entities) {
		final Set<String> expected = Sets.newHashSet();
		for(AdminEntity entity : entities) {
			expected.add(PlaceNames.cleanName(entity.getName()));
		}
		this.predicate = Predicates.and(predicate, new Predicate<Object>() {
			public boolean apply(Object value) {
				if(value == null) {
					return false;
				} else {
					String stringValue = PlaceNames.cleanName(value.toString());
					return expected.contains(stringValue);
				}
			}
		});
		return this;
	}

    public ColumnGuesser favoringUniqueValues() {
    	favorUniqueValues = true;
        return this;
    }

    /**
     * Finds the index of the column/attribute in the import source that best
     * matches the given criteria
     */
    public int findBest(ImportSource source) {
        int bestAttribute = -1;
        double bestScore = 0;

        for (int attributeIndex = 0; attributeIndex != source.getAttributeCount(); ++attributeIndex) {
            double score = scoreColumn(source, attributeIndex);
            if (score > bestScore) {
                bestScore = score;
                bestAttribute = attributeIndex;
            }
        }
        return bestAttribute;
    }

    /**
     * Scores a column based on the provided criteria. 0=poor match, high=good
     * match
     */
    private double scoreColumn(ImportSource source, int attributeIndex) {
        double score = 0;
        
        score += scorePredicate(source, attributeIndex);

        if(favorUniqueValues) {
        	score += scoreUnique(source, attributeIndex);
        }
        return score;
    }

	/**
     * Scores a given column/attribute based on the provided regex.
     * 
     * @param source
     *            the import source
     * @param attributeIndex
     *            the index of the attribute in the ImportScore to evaluate
     * @return the proportion of values in the column that match the regex.
     *         (0=poor match,1=perfect match)
     */
    private double scorePredicate(ImportSource source, int attributeIndex) {
        int numMatching = 0;

        for (ImportFeature feature : source.getFeatures()) {
            Object value = feature.getAttributeValue(attributeIndex);
            if (predicate.apply(value)) {
                numMatching++;
            }
        }
        return ratio(numMatching, source.getFeatures().size());
    }

    /**
     * Scores the column on the uniqueness of its values, from 1=all values are
     * unique.
     */
    private double scoreUnique(ImportSource source, int attributeIndex) {
        Set<String> values = Sets.newHashSet();

        for (ImportFeature feature : source.getFeatures()) {
            String value = feature.getAttributeStringValue(attributeIndex);
            if (!Strings.isNullOrEmpty(value)) {
                values.add(value);
            }
        }

        return ratio(values.size(), source.getFeatureCount());
    }

    private double ratio(double numerator, double denominator) {
        return numerator / denominator;
    }

}