/*##################################################################### * * CLAVIN (Cartographic Location And Vicinity INdexer) * --------------------------------------------------- * * Copyright (C) 2012-2013 Berico Technologies * http://clavin.bericotechnologies.com * * ==================================================================== * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * ==================================================================== * * MultipartLocationResolver.java * *###################################################################*/ package com.bericotech.clavin.resolver.multipart; import com.bericotech.clavin.ClavinException; import com.bericotech.clavin.gazetteer.CountryCode; import com.bericotech.clavin.gazetteer.query.AncestryMode; import com.bericotech.clavin.gazetteer.query.FuzzyMode; import com.bericotech.clavin.gazetteer.query.Gazetteer; import com.bericotech.clavin.gazetteer.GeoName; import com.bericotech.clavin.gazetteer.query.QueryBuilder; import com.bericotech.clavin.resolver.ResolvedLocation; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Deque; import java.util.EnumSet; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Set; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Resolves multipart location names from structured data into GeoName objects. * * Takes multipart location names, such as what's often found in structured data * like a spreadsheet or database table (e.g., [Reston][Virginia][United States]), * and resolves them into the appropriate geographic entities by identifying the * most logical match in a gazetteer, trying to enforce some kind of notional * hierarchy of place names (e.g., city --> state/province/etc. --> country). */ public class MultipartLocationResolver { /** * The logger. */ private final static Logger LOG = LoggerFactory.getLogger(MultipartLocationResolver.class); /** * The hit depth used during searches. */ private static final int MAX_RESULTS = 200; /** * The gazetteer for searches. */ private final Gazetteer gazetteer; /** * The scorer for multi-value searches. */ private final Scorer scorer; public MultipartLocationResolver(final Gazetteer gaz) { this.gazetteer = gaz; scorer = new DefaultScorer(); } /** * Resolves a multipart location name, such as what's often found * in structured data like a spreadsheet or database table (e.g., * [Reston][Virginia][United States]), into a {@link ResolvedMultipartLocation} * containing {@link com.bericotech.clavin.gazetteer.GeoName} objects. * * @param location multipart location name to be resolved * @param fuzzy switch for turning on/off fuzzy matching * @return resolved multipart location name * @throws ClavinException if an error occurs while resolving locations */ public ResolvedMultipartLocation resolveMultipartLocation(MultipartLocationName location, boolean fuzzy) throws ClavinException { // find all component locations in the gazetteer QueryBuilder queryBuilder = new QueryBuilder() // translate CLAVIN 1.x 'fuzzy' parameter into NO_EXACT or OFF; it isn't // necessary, or desirable to support FILL for the multi-part resolution algorithm .fuzzyMode(fuzzy ? FuzzyMode.NO_EXACT : FuzzyMode.OFF) .includeHistorical(true) .ancestryMode(AncestryMode.ON_CREATE) .maxResults(MAX_RESULTS); // country query should only include country-like feature codes queryBuilder.location(location.getCountry()).addCountryCodes(); List<ResolvedLocation> countries = gazetteer.getClosestLocations(queryBuilder.build()); // remove all "countries" that are not considered top-level administrative divisions; this // filters out territories that do not contain descendant GeoNames Iterator<ResolvedLocation> iter = countries.iterator(); while (iter.hasNext()) { if (!iter.next().getGeoname().isTopLevelAdminDivision()) { iter.remove(); } } Set<CountryCode> foundCountries = EnumSet.noneOf(CountryCode.class); // state query should only include admin-level feature codes with ancestors // in the list of located countries queryBuilder.location(location.getState()).clearFeatureCodes().addAdminCodes(); for (ResolvedLocation country : countries) { queryBuilder.addParentIds(country.getGeoname().getGeonameID()); foundCountries.add(country.getGeoname().getPrimaryCountryCode()); } List<ResolvedLocation> states = gazetteer.getClosestLocations(queryBuilder.build()); // city query should only include city-level feature codes; ancestry is restricted // to the discovered states or, if no states were found, the discovered countries or, // if neither states nor countries were found, no ancestry restrictions are added and // the most populated city will be selected queryBuilder.location(location.getCity()).clearFeatureCodes().addCityCodes(); if (!states.isEmpty()) { Set<CountryCode> stateCodes = EnumSet.noneOf(CountryCode.class); // only clear the parent ID restrictions if states were found; otherwise // we will continue our search based on the existing country restrictions, if any queryBuilder.clearParentIds(); for (ResolvedLocation state : states) { // only include the first administrative division found for each target // country if (!stateCodes.contains(state.getGeoname().getPrimaryCountryCode())) { queryBuilder.addParentIds(state.getGeoname().getGeonameID()); stateCodes.add(state.getGeoname().getPrimaryCountryCode()); } // since we are only including one "state" per country, short-circuit // the loop if we have added one for each unique country code returned // by the countries search if (!foundCountries.isEmpty() && foundCountries.equals(stateCodes)) { break; } } } List<ResolvedLocation> cities = gazetteer.getClosestLocations(queryBuilder.build()); // initialize return objects components ResolvedLocation finalCity = null; ResolvedLocation finalState = null; ResolvedLocation finalCountry = null; // assume the most populous valid city is the correct one return // note: this should be a reasonably safe assumption since we've attempted to enforce the // notional hierarchy of given place names (e.g., city --> state/province/etc. --> country) // and have therefore weeded out all other matches that don't fit this hierarchy if (!cities.isEmpty()) { finalCity = cities.get(0); } if (!states.isEmpty()) { // if we couldn't find a valid city, just take the most populous valid state/province/etc. if (finalCity == null) { finalState = states.get(0); } else { for (ResolvedLocation state : states) { // select the first state that is an ancestor of the selected city if (finalCity.getGeoname().isDescendantOf(state.getGeoname())) { finalState = state; break; } } } } if (!countries.isEmpty()) { // use the selected city if available and the selected state if not to identify the selected country ResolvedLocation best = finalCity != null ? finalCity : finalState; // if neither city nor state was resolved, take the most populous valid country if (best == null) { finalCountry = countries.get(0); } else { for (ResolvedLocation country : countries) { // select the first country that is an ancestor of the selected city or state if (best.getGeoname().isDescendantOf(country.getGeoname())) { finalCountry = country; break; } } } } return new ResolvedMultipartLocation(finalCity, finalState, finalCountry); } /** * Attempts to resolve a location provided as a comma-separated string of political divisions from * narrowest to broadest. The gazetteer current supports ancestry from the country level through four * administrative divisions so any more-specific divisions will be ignored once a city (lowest available * level of resolution) is found. Results will only be returned if all unignored location components are * matched. * @param loc the comma-separated location name (e.g. "City, County, State, Country") * @param fuzzy <code>true</code> to use fuzzy matching if an exact match for any location could not be found * @return the resolved location * @throws ClavinException if an error occurs while searching */ public ResolvedLocation resolveLocation(final String loc, final boolean fuzzy) throws ClavinException { return resolveLocation(fuzzy, loc.split(",")); } /** * Resolves a location provided as a series of political divisions from narrowest to broadest. The gazetteer * current supports ancestry from the country level through four administrative divisions so any more-specific * divisions will be ignored once a city (lowest available level of resolution) is found. Results will only * be returned if all unignored location components are matched. * @param fuzzy <code>true</code> to use fuzzy matching if an exact match for any location could not be found * @param locationParts the names of the locations to match, ordered from most to least specific * (e.g. [ "City", "County", "State", "Country" ]) * @return the resolved location * @throws ClavinException if an error occurs while searching */ @SuppressWarnings("unchecked") public ResolvedLocation resolveLocation(final boolean fuzzy, final String... locationParts) throws ClavinException { final List<String> terms = new ArrayList<String>(locationParts.length+1); // terms will be a list of broadest to narrowest; e.g. United States, Virginia, Fairfax County, Reston for (String part : locationParts) { if (part != null && !part.trim().equals("")) { terms.add(0, part); } } // short circuit if no input was provided if (terms.isEmpty()) { return null; } Set<MatchedLocation> candidates = new HashSet<MatchedLocation>(); Deque<SearchResult> matches = new LinkedList<SearchResult>(); QueryBuilder query = new QueryBuilder() .maxResults(MAX_RESULTS) // translate CLAVIN 1.x 'fuzzy' parameter into NO_EXACT or OFF; it isn't // necessary, or desirable to support FILL for the multi-part resolution algorithm .fuzzyMode(fuzzy ? FuzzyMode.NO_EXACT : FuzzyMode.OFF) .ancestryMode(AncestryMode.ON_CREATE) .includeHistorical(true); findCandidates(candidates, terms, SearchLevel.COUNTRY, matches, query); // Using post-processing sort instead of SortedSet implementation (TreeSet) because // TreeSet uses compareTo instead of equals/hashCode to eliminate duplicates and // incorrectly excludes elements that evaluate to the same sort score List<MatchedLocation> candidateList = new ArrayList<MatchedLocation>(candidates); Collections.sort(candidateList, new Comparator<MatchedLocation>() { @Override public int compare(final MatchedLocation loc1, final MatchedLocation loc2) { double score1 = scorer.score(terms, loc1); double score2 = scorer.score(terms, loc2); // sort candidates in descending order by score return Double.compare(score2, score1); } }); if (LOG.isDebugEnabled()) { LOG.debug("Found {} candidates", candidateList.size()); for (MatchedLocation candidate : candidateList) { LOG.debug(String.format("[%.3f] %s", scorer.score(terms, candidate), candidate.toString())); } } MatchedLocation bestMatch = candidateList.isEmpty() ? null : candidateList.get(0); ResolvedLocation location = null; if (bestMatch != null && (bestMatch.isFullySpecified() || bestMatch.getMatchCount() == terms.size())) { location = bestMatch.getMostSpecificMatch().getLocation(); } return location; } @SuppressWarnings("unchecked") private void findCandidates(final Set<MatchedLocation> candidates, final List<String> terms, final SearchLevel level, final Deque<SearchResult> matches, final QueryBuilder query) throws ClavinException { // if there are no more terms or level is null, add a candidate to the list // if there are any prior matches if (terms.isEmpty() || level == null) { if (!matches.isEmpty()) { if (LOG.isDebugEnabled()) { LOG.debug("Adding candidate for matches:"); for (SearchResult res : matches) { LOG.debug(res.toString()); } } candidates.add(new MatchedLocation(matches)); } return; } String term = terms.get(0); List<String> nextTerms = terms.size() > 1 ? terms.subList(1, terms.size()) : Collections.EMPTY_LIST; SearchResult lastMatch = matches.peek(); level.apply(query).location(term).clearParentIds(); if (lastMatch != null) { query.parentIds(lastMatch.parentIds); } List<ResolvedLocation> results = gazetteer.getClosestLocations(query.build()); // no results for this term at this level; search for this term at the // next level, then search for subsequent terms at this level if (results.isEmpty()) { findCandidates(candidates, terms, level.narrow(), matches, query); findCandidates(candidates, nextTerms, level, matches, query); } else { // we found results, process them to configure the filters for the next // level of the search and add them to the matches stack Set<Integer> parentIds = new HashSet<Integer>(); Set<String> parentCodes = new HashSet<String>(); Set<String> foundParents = new HashSet<String>(); // only include the first (best) result for each distinct parent in the filter set for (ResolvedLocation loc : results) { GeoName geo = loc.getGeoname(); String pCode = lastMatch != null ? lastMatch.level.getCode(geo) : null; // if there were no parent filters or we have not found a child for this parent // code, add this location to the filter set if (lastMatch == null || !foundParents.contains(pCode)) { parentIds.add(geo.getGeonameID()); parentCodes.add(level.getCode(geo)); foundParents.add(pCode); } // if there was a previous filter set, short-circuit once we have // a child from each parent if (lastMatch != null && foundParents.equals(lastMatch.parentCodes)) { break; } } matches.push(new SearchResult(level, results, parentIds, parentCodes)); // continue search for additional terms after adding these results to the // match stack findCandidates(candidates, nextTerms, level.narrow(), matches, query); // pop this match off the stack, then search for this term at the next level matches.pop(); findCandidates(candidates, terms, level.narrow(), matches, query); } } }