package org.activityinfo.geoadmin; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.vividsolutions.jts.geom.Envelope; import org.activityinfo.geoadmin.model.AdminEntity; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Set; /** * Process for matching imported features to a set of existing administrative * entities. */ public class Joiner { /** * The minimum score for a match by name. Below this we just get garbage. */ private static final double MIN_NAME_MATCH = 0.75; /** * The threshold of a very sure match geographically. */ private static final double SURE_GEO_MATCH = 0.95; private List<AdminEntity> entities; private List<ImportFeature> features; private int nameAttributeIndex = -1; public Joiner(List<AdminEntity> entities, List<ImportFeature> features) { this.entities = entities; this.features = features; } /** * Joins the imported features to their probable parents among the list of * admin entities. * * @return a list of entities, where each item in the list is the best * matching parent for the corresponding item in the features list. */ public List<AdminEntity> joinParents() { List<AdminEntity> parents = Lists.newArrayList(); for (ImportFeature feature : features) { parents.add(findBestMatchingParentEntity(feature)); } return parents; } /** * Joins the imported features on a one-to-one basis with the list of admin * entities. */ public List<Join> joinOneToOne() { List<Join> joins = Lists.newArrayList(); Set<ImportFeature> unmatchedFeatures = Sets.newHashSet(features); Set<AdminEntity> unmatchedEntities = Sets.newHashSet(entities); // in the first pass, match strictly by name for (AdminEntity unit : entities) { ImportFeature match = matchExactlyByName(unit); if (match != null && unmatchedEntities.contains(match)) { joins.add(new Join(unit, match)); unmatchedEntities.remove(unit); unmatchedFeatures.remove(match); } } // now match remaining elements fuzzily for (AdminEntity entity : Lists.newArrayList(unmatchedEntities)) { ImportFeature match = findBestMatch(entity, unmatchedFeatures); if (match != null) { joins.add(new Join(entity, match)); unmatchedEntities.remove(entity); unmatchedFeatures.remove(match); } } // and finally and unmatched ones as loners for (AdminEntity entity : unmatchedEntities) { joins.add(new Join(entity, null)); } for (ImportFeature feature : unmatchedFeatures) { joins.add(new Join(null, feature)); } sortJoinsByAdminName(joins); return joins; } private void sortJoinsByAdminName(List<Join> joins) { Collections.sort(joins, new Comparator<Join>() { @Override public int compare(Join o1, Join o2) { String s1 = "ZZZZ"; String s2 = "ZZZZ"; if (o1.getEntity() != null) { s1 = o1.getEntity().getName(); } if (o2.getEntity() != null) { s2 = o2.getEntity().getName(); } return s1.compareTo(s2); } }); } /** * Match the admin entity to the closest element in the collection of * features * * @param entity * @param features * @return */ private ImportFeature findBestMatch(AdminEntity entity, Iterable<ImportFeature> features) { double bestScore = 0; ImportFeature bestFeature = null; for (ImportFeature feature : features) { double nameSimilarity = scoreName(entity, feature); double geoScore = scoreOverlap(entity, feature); // avoid totally spurious matches... if (nameSimilarity > MIN_NAME_MATCH || geoScore > SURE_GEO_MATCH) { double totalScore = nameSimilarity + geoScore; if (totalScore > bestScore) { bestScore = totalScore; bestFeature = feature; } } } return bestFeature; } /** * Matches exactly by the name, returning a value if EXACTLY one feature * matches the name, otherwise null. */ private ImportFeature matchExactlyByName(AdminEntity entity) { Set<ImportFeature> exactMatches = Sets.newHashSet(); for (ImportFeature feature : features) { if (feature.similarity(entity.getName()) == 1.0) { exactMatches.add(feature); } } if (exactMatches.size() == 1) { return exactMatches.iterator().next(); } else { return null; } } /** * Finds the admin entity that best matches the * given the feature. * * @param feature * @return */ private AdminEntity findBestMatchingParentEntity(ImportFeature feature) { double bestScore = 0; AdminEntity bestEntity = null; for (AdminEntity entity : entities) { double score = scorePotentialParent(entity, feature); if (score > bestScore) { bestScore = score; bestEntity = entity; } } return bestEntity; } public double scorePotentialParent(AdminEntity entity, ImportFeature feature) { // calculate the proportion of this feature that is contained by the // potential parent double geoScore = scoreContainment(feature, entity); // calculate the name overlap double nameSimilarity = scoreName(entity, feature); // give a bonus for perfect name match if (nameSimilarity == 1.0) { nameSimilarity += 0.25; } // bonus for being perfectly contained if(geoScore > 0.99) { geoScore += .50; } System.out.println(String.format("%s <> %s %.2f %.2f", entity.getName(), featureName(feature), geoScore, nameSimilarity)); return nameSimilarity + geoScore; } private String featureName(ImportFeature feature) { if (nameAttributeIndex >= 0) { Object featureName = feature.getAttributeValue(nameAttributeIndex); if (featureName instanceof String) { return (String)featureName; } } return null; } /** * Calculates the area of intersection of the MBR's of the admin entity and * the feature to import */ public static double areaOfIntersection(AdminEntity entity, ImportFeature feature) { if (entity.getBounds() == null) { return 0; } Envelope unitEnvelope = GeoUtils.toEnvelope(entity.getBounds()); Envelope featureEnvelope = feature.getEnvelope(); return unitEnvelope.intersection(featureEnvelope).getArea(); } public static double scoreOverlap(AdminEntity entity, ImportFeature feature) { if(entity.getBounds() == null) { return 0; } return areaOfIntersection(entity, feature) / GeoUtils.toEnvelope(entity.getBounds()).getArea(); } public static double scoreContainment(ImportFeature feature, AdminEntity entity) { return areaOfIntersection(entity, feature) / feature.getEnvelope().getArea(); } public double scoreName(AdminEntity entity, ImportFeature feature) { if (nameAttributeIndex >= 0) { Object featureName = feature.getAttributeValue(nameAttributeIndex); if (featureName instanceof String) { return PlaceNames.similarity((String) featureName, entity.getName()); } } return 0; } public void setNameAttributeIndex(int attributeIndex) { this.nameAttributeIndex = attributeIndex; } }