package org.opensextant.extractors.geo.rules; import java.util.HashMap; import java.util.List; import java.util.Map; import org.opensextant.data.Country; import org.opensextant.data.Place; import org.opensextant.extractors.geo.CountryCount; import org.opensextant.extractors.geo.PlaceCandidate; import org.opensextant.extractors.geo.PlaceCount; import org.opensextant.extractors.geo.PlaceEvidence; import org.opensextant.util.GeodeticUtility; /** * A final geocoding pass or two. Loop through candidates and choose * the location that best fits the context. * * As needed cache chosen entries to optimize, e.g. co-referrenced places * aformentioned in document. Ideally, consider choosing a best place for the * particular instance of a name, but percolate that to the other mentions of that same name. * Is it the same place? No need to disambiguate it multiple times at this point. * * @author ubaldino * */ public class LocationChooserRule extends GeocodeRule { /** * These are set. */ private Map<String, CountryCount> countryContext = null; private Map<String, PlaceCount> boundaryContext = null; private Map<String, PlaceCount> namespace = new HashMap<>(); private HashMap<String, CountryCount> inferredCountries = new HashMap<>(); /** * These are accumulated. */ private Map<String, Place> documentResolvedLocations = new HashMap<>(); private Map<String, PlaceCandidate> documentCandidates = new HashMap<>(); @Override public void reset() { documentResolvedLocations.clear(); documentCandidates.clear(); namespace.clear(); inferredCountries.clear(); } /** * Walk the entire list. */ public void evaluate(List<PlaceCandidate> names) { // INPUTS: // histogram of country mentions // resolved/relevant provinces (PlaceEvidence) // resolved/relevant locations attached to places (PlaceEvidence) // // MEASURES: // # of distinct countries == density, focus. Is this document about one or two countries, // or is it a world news report on everything. // countryContext = countryObserver.countryMentionCount(); boundaryContext = boundaryObserver.placeMentionCount(); /* TODO: DEBUG through location chooser using histograms * of found and resolved place metadata. * */ if (log.isDebugEnabled()) { debuggingHistograms(names); } for (PlaceCandidate name : names) { if (name.isFilteredOut() || name.isCountry) { continue; } if (name.getChosen() != null) { // documentResolvedLocations.put(name.getTextnorm(), name.getChosen()); // CACHE? // DONE continue; } // + For each Name, stack evidence for a given geo or a class of geo (evidence applies to multiple candidate geos) // + Assign a weight for each geo based on innate features and evidence. // + Sort by final score // + Choose top score // + Cache result for a given NAME = CHOSEN, so we don't repeat the same logic unnecessarily. // for (Place geo : name.getPlaces()) { evaluate(name, geo); } name.choose(); if (name.getChosen() != null) { this.assessConfidence(name); documentResolvedLocations.put(name.getTextnorm(), name.getChosen()); } else { log.info("Place name is ambiguous: {} in N={} places", name.getText(), name.distinctLocationCount()); } } } /** * What can we learn from assembling better stats at the document level? * Evidence breaks down into concrete locations vs. inferred. * * @param names */ private void debuggingHistograms(List<PlaceCandidate> names) { /* * TODO: Is this histogram helpful.? * * Uniqueness or popularity of a given name. */ for (PlaceCandidate name : names) { if (name.isFilteredOut()) { continue; } PlaceCount x = namespace.get(name.getTextnorm()); if (x == null) { x = new PlaceCount(); x.place = new Place(name.getTextnorm(), name.getTextnorm()); x.total = names.size(); namespace.put(name.getTextnorm(), x); } else { ++x.count; } } for (String cc : countryContext.keySet()) { CountryCount count = countryContext.get(cc); //log.debug("Country: {}/{} ({})", cc, count.country, count.count); log.debug("Country: {}", count); } for (PlaceCount count : boundaryContext.values()) { //log.debug("Boundary: {} ({})", count.place, count.count); log.debug("Boundary: {}", count); String cc = count.place.getCountryCode(); CountryCount Ccnt = inferredCountries.get(cc); if (Ccnt == null) { Ccnt = new CountryCount(); Ccnt.country = new Country(cc, cc); inferredCountries.put(cc, Ccnt); } else { ++Ccnt.count; } } log.debug("Places: {}/{}", namespace.size(), namespace); } protected static final double ADMIN_CONTAINS_PLACE_WT = 3.0; protected static final double COUNTRY_CONTAINS_PLACE_WT = 1.0; /** * An amount of points that would be distributed amongst feature types * at each level, e.g., Country names, ADM1, ADM2, PPL names. * * If you have 2 different countries, one mentioned 4 times and the other mentioned 10 times * you might say the latter is more relevant regarding any ambiguous geography. * With 14 mentions, that second country is weighted 10/14 = 0.71 of the GLOBAL_POINTS for disambiguation. * * Note, that if only one country appears in context, then it is very possible * that these global points will outweigh other over arching connections, such as rules for * CITY,STATE or MAJOR PLACE (POPULATION). That is okay -- if one single country is mentioned at all, * then that seems to be a big anchoring point for lots of ambiguities. */ private static final int GLOBAL_POINTS = 5; /** * Yet unchosen location. * Consider given evidence first, creating some weight there, * then introducing innate properties of possible locations, thereby amplifying the * differences in the candidates. * */ @Override public void evaluate(PlaceCandidate name, Place geo) { if (boundaryContext.isEmpty() && countryContext.isEmpty()) { return; } double countryScalar = 1.0; CountryCount ccnt = countryContext.get(geo.getCountryCode()); if (ccnt != null) { countryScalar = GLOBAL_POINTS * ccnt.getRatio(); } // Choose either boundary or country context to add in for this location. // This is inferred stuff from the document at large. if (geo.getHierarchicalPath() != null && boundaryContext.containsKey(geo.getHierarchicalPath())) { name.incrementPlaceScore(geo, countryScalar * ADMIN_CONTAINS_PLACE_WT); } else if (countryContext.containsKey(geo.getCountryCode())) { name.incrementPlaceScore(geo, countryScalar * COUNTRY_CONTAINS_PLACE_WT); } // Other local evidence. // for (PlaceEvidence ev : name.getEvidence()) { if (ev.wasEvaluated()) { continue; } ev.defaultHierarchicalPath(); // Evaluate evidence if ((ev.getAdmin1() != null && geo.getAdmin1() != null)) { if (geo.getHierarchicalPath().equals(ev.getHierarchicalPath())) { name.incrementPlaceScore(geo, ADMIN_CONTAINS_PLACE_WT); } } else { if (geo.getCountryCode().equals(ev.getCountryCode())) { name.incrementPlaceScore(geo, COUNTRY_CONTAINS_PLACE_WT); } } ev.setEvaluated(true); log.debug("\tEvidence: {} {}", ev, ev.getAdmin1()); } } /** * */ public static final int MATCHCONF_BARE_ACRONYM = 10; /** * The bare minimum confidence -- if rules negate confidence points, * confidence may go below 20. */ public static final int MATCHCONF_MINIMUM = 20; /** * Absolute Confidence: Many Locations matched a single name. * No country is in scope; No country mentioned in document, so this is very low confidence. */ public static final int MATCHCONF_MANY_LOC = MATCHCONF_MINIMUM; /** * Absolute Confidence: Many locations matched, with multiple countries in scope * So, Many countries mentioned in document */ public static final int MATCHCONF_MANY_COUNTRIES = 40; /** * Absolute Confidence: Many locations matched, but one country in scope. * So, 1 country mentioned in document */ public static final int MATCHCONF_MANY_COUNTRY = 50; /** * Absolute Confidence: Name, Region; City, State; Capital, Country; etc. * Patterns of qualified places. */ public static final int MATCHCONF_NAME_REGION = 60; /** * Absolute Confidence: Unique name in gazetteer. */ public static final int MATCHCONF_ONE_LOC = 80; /** Absolute Confidence: Geographic location of a named place lines up with a coordinate in-scope */ public static final int MATCHCONF_GEODETIC = 90; /** Confidence Qualifier: The chosen place happens to be a major place, e.g., large city. */ public static final int MATCHCONF_QUALIFIER_MAJOR_PLACE = 5; /** Confidence Qualifier: The chosen place happens to be in a country mentioned in the document */ public static final int MATCHCONF_QUALIFIER_COUNTRY_MENTIONED = 5; /** * Confidence Qualifier: Ambiguous */ public static final int MATCHCONF_QUALIFIER_AMBIGUOUS_NAME = -5; /** * Confidence Qualifier: Name appears in only one country. * */ public static final int MATCHCONF_QUALIFIER_UNIQUE_COUNTRY = 8; /** Confidence Qualifier: The chosen place scored high compared to the runner up */ public static final int MATCHCONF_QUALIFIER_HIGH_SCORE = 5; /** * Confidence Qualifier: Start here if you have a lower case term that may be a place. * -20 points or more for lower case matches, however feat_class P and A win back 5 points; others are * less likely places. */ public static final int MATCHCONF_QUALIFIER_LOWERCASE = -15; private static boolean isShort(int matchLen) { return matchLen <= NonsenseFilter.GENERIC_ONE_WORD; } /** * Confidence of your final chosen location for a given name is assembled as the sum of some absolute metric * plus some additional qualifiers. The absolute provides some context at the document level, whereas the * qualifiers are refinements. * * <pre> * conf = A + Q1 + Q2... // this may change. * </pre> * * @param pc */ public void assessConfidence(PlaceCandidate pc) { if (pc.getChosen() == null && pc.distinctLocationCount() > 0) { // Either not evaluated yet or no good choice could be made. // Ambiguous location name. pc.setConfidence(MATCHCONF_MANY_LOC); return; } int points = 0; // This place candidate instance: // - total # of instances in gazetteer, e.g., getPlaces() // - distinct countries for those places, e.g., // // Mutually Exclusive conditions: //====================== if (pc.hasRule(CoordinateAssociationRule.COORD_PROXIMITY_RULE)) { points = MATCHCONF_GEODETIC; } else if (pc.distinctLocationCount() == 1 && countryObserver.countryCount() > 0) { points = MATCHCONF_ONE_LOC; } else if (countryObserver.countryCount() == 0 && pc.hasDiacritics && isShort(pc.getLength())) { points = MATCHCONF_MINIMUM; } else if (pc.hasRule(NameCodeRule.NAME_ADMCODE_RULE) || pc.hasRule(NameCodeRule.NAME_ADMNAME_RULE)) { points = MATCHCONF_NAME_REGION; } else if (countryObserver.countryCount() == 1) { points = MATCHCONF_MANY_COUNTRY; } else if (pc.getEvidence().isEmpty()) { points = assessLowConfidence(pc); } else if (countryObserver.countryCount() > 0) { points = MATCHCONF_MANY_COUNTRIES; } else { points = MATCHCONF_MANY_LOC; } // Any of these may occur. //====================== // // Lower case? Eh... language dependent. // If you have mixed case documents, then lower case matches // immediately get low-confidence. if (pc.isLower()) { points += MATCHCONF_QUALIFIER_LOWERCASE; if (pc.getChosen().isAdministrative()) { points += 10; } else if (pc.getChosen().isPopulated()) { points += 5; } } // TODO: work through ambiguities -- true ties. // AMBIGUOUS TIE: if (pc.isAmbiguous()) { Place p1 = pc.getChosen(); Place p2 = pc.getSecondChoice(); if (GeodeticUtility.distanceMeters(p1, p2) < SAME_LOCALITY_RADIUS) { points += 6; } else if (p1.isSame(p2)) { points += 4; } else if (sameBoundary(p1, p2)) { points += 3; } else if (sameCountry(p1, p2)) { points += 2; } else { points += MATCHCONF_QUALIFIER_AMBIGUOUS_NAME; } } else if (pc.getSecondChoiceScore() > 0) { // NOT AMBIGUOUS, but is first score much higher than all others? // That makes first choice more confident, especially in low-evidence situations. double a = pc.getChosen().getScore(); double b = pc.getSecondChoiceScore(); double scoreRatio = a / b; // Top score = 40, second score = 25 if (scoreRatio > 1.2) { // 20% better points += MATCHCONF_QUALIFIER_HIGH_SCORE; } } if (pc.distinctCountryCount() == 1) { points += MATCHCONF_QUALIFIER_UNIQUE_COUNTRY; } // Is Major place? Account for major place population separate from its designation. if (pc.hasRule(MajorPlaceRule.POP)) { points += MATCHCONF_QUALIFIER_MAJOR_PLACE; } if (pc.hasRule(MajorPlaceRule.ADMIN) || pc.hasRule(MajorPlaceRule.CAPITAL)) { points += MATCHCONF_QUALIFIER_MAJOR_PLACE; } // if (this.countryObserver.countryObserved(pc.getChosen().getCountryCode())) { points += MATCHCONF_QUALIFIER_COUNTRY_MENTIONED; } pc.setConfidence(points); } private static final int SAME_LOCALITY_RADIUS = 10000; /* Meters */ private int assessLowConfidence(PlaceCandidate pc) { /* * False positive tuning -- working with something that has only a default score. * Acronyms, No Evidence, default score. All pretty much the same amount of confidence. * * <pre> * TEXT GEO MATCHED * ---- ---------- * ABS Abs low confidence. Acronym is intended. Mismatch. If No other evidence, really low confidence * Abs Abs good match * Abs ABS not bad. Id. matches ID for Idaho, for example. * Abs. Abs. good match. Abbreviation matched abbreviation. TODO. * Abs. ABS good match. Abbreviation matched abbreviation or code. TODO. * </pre> */ //boolean noEvidence = pc.getEvidence().isEmpty(); boolean isAcronym = pc.isUpper(); boolean isMisMatchedAcronym = (pc.isUpper() && !pc.getChosen().isUppercaseName()) || (!pc.isUpper() && pc.getChosen().isUppercaseName()); int points = MATCHCONF_MINIMUM; if (pc.hasDefaultRuleOnly() && isMisMatchedAcronym) { points = MATCHCONF_BARE_ACRONYM; } else if (isAcronym) { // Acronym with some evidence. points = MATCHCONF_BARE_ACRONYM + 3; } return points; } }