package com.bericotech.clavin.resolver; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import com.bericotech.clavin.ClavinException; import com.bericotech.clavin.extractor.LocationOccurrence; import com.bericotech.clavin.gazetteer.query.LuceneGazetteer; import java.io.File; import java.util.List; import org.junit.Before; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /*##################################################################### * * CLAVIN (Cartographic Location And Vicinity INdexer) * --------------------------------------------------- * * Copyright (C) 2012-2013 Berico Technologies * http://clavin.bericotechnologies.com * * ==================================================================== * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * ==================================================================== * * LuceneLocationResolverHeuristicsTest.java * *###################################################################*/ /** * Tests the mapping of location names into * {@link ResolvedLocation} objects as performed by * {@link ClavinLocationResolver#resolveLocations(List, boolean)}. */ public class ClavinLocationResolverHeuristicsTest { public final static Logger logger = LoggerFactory.getLogger(ClavinLocationResolverHeuristicsTest.class); // expected geonameID numbers for given location names private static final int BOSTON_MA = 4930956; private static final int HAVERHILL_MA = 4939085; private static final int WORCESTER_MA = 4956184; private static final int SPRINGFIELD_MA = 4951788; private static final int CHICAGO_IL = 4887398; private static final int ROCKFORD_IL = 4907959; private static final int SPRINGFIELD_IL = 4250542; private static final int DECATUR_IL = 4236895; private static final int KANSAS_CITY_MO = 4393217; private static final int SPRINGFIELD_MO = 4409896; private static final int ST_LOUIS_MO = 4407066; private static final int INDEPENDENCE_MO = 4391812; private static final int LONDON_UK = 2643743; private static final int MANCHESTER_UK = 2643123; private static final int HAVERHILL_UK = 2647310; private static final int TORONTO_ON = 6167865; private static final int OTTAWA_ON = 6094817; private static final int HAMILTON_ON = 5969782; private static final int KITCHENER_ON = 5992996; private static final int LONDON_ON = 6058560; private static final int CAIRO_EG = 360630; private static final int BENGHAZI_LY = 88319; private static final int VIRGINIA_US = 6254928; private static final int WASHINGTON_DC = 4140963; private static final int MARYLAND_US = 4361885; private static final int SEATTLE_WA = 5809844; private static final int WASHINGTON_STATE_US = 5815135; private static final int TACOMA_WA = 5812944; private static final int NO_HEURISTICS_MAX_HIT_DEPTH = 1; private static final int NO_HEURISTICS_MAX_CONTEXT_WINDOW = 1; private static final int HEURISTICS_MAX_HIT_DEPTH = 5; private static final int HEURISTICS_MAX_CONTEXT_WINDOW = 5; private ClavinLocationResolver resolver; private List<ResolvedLocation> resolvedLocations; /** * Instantiate two {@link ClavinLocationResolver} objects, one without * context-based heuristic matching and other with it turned on. */ @Before public void setUp() throws ClavinException { resolver = new ClavinLocationResolver(new LuceneGazetteer(new File("./IndexDirectory"))); } private List<ResolvedLocation> resolveNoHeuristics(final List<LocationOccurrence> locs, final boolean fuzzy) throws ClavinException { return resolver.resolveLocations(locs, NO_HEURISTICS_MAX_HIT_DEPTH, NO_HEURISTICS_MAX_CONTEXT_WINDOW, fuzzy); } private List<ResolvedLocation> resolveWithHeuristics(final List<LocationOccurrence> locs, final boolean fuzzy) throws ClavinException { return resolver.resolveLocations(locs, HEURISTICS_MAX_HIT_DEPTH, HEURISTICS_MAX_CONTEXT_WINDOW, fuzzy); } /** * Ensure we select the correct {@link ResolvedLocation} objects * without using context-based heuristic matching. * * Without heuristics, {@link ClavinLocationResolver} will default to * mapping location name Strings to the matching * {@link ResolvedLocation} object with the greatest (sort boosted) population. */ @Test public void testNoHeuristics() throws ClavinException { String[] locations = {"Haverhill", "Worcester", "Springfield", "Kansas City"}; resolvedLocations = resolveNoHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), false); assertEquals("LocationResolver chose the wrong \"Haverhill\"", HAVERHILL_MA, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Worcester\"", WORCESTER_MA, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Springfield\"", SPRINGFIELD_MO, resolvedLocations.get(2).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Kansas City\"", KANSAS_CITY_MO, resolvedLocations.get(3).getGeoname().getGeonameID()); } /** * Ensure we select the correct Springfield in a document about * Massachusetts using context-based heuristic matching. */ @Test public void testHeuristicsMassachusetts() throws ClavinException { String[] locations = {"Boston", "Haverhill", "Worcester", "Springfield", "Leominister"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Boston\"", BOSTON_MA, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Haverhill\"", HAVERHILL_MA, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Worcester\"", WORCESTER_MA, resolvedLocations.get(2).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Springfield\"", SPRINGFIELD_MA, resolvedLocations.get(3).getGeoname().getGeonameID()); } /** * Ensure we select the correct Springfield in a document about * Illinois using context-based heuristic matching. */ @Test public void testHeuristicsIllinois() throws ClavinException { String[] locations = {"Chicago", "Rockford", "Springfield", "Decatur"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Chicago\"", CHICAGO_IL, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Rockford\"", ROCKFORD_IL, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Springfield\"", SPRINGFIELD_IL, resolvedLocations.get(2).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Decatur\"", DECATUR_IL, resolvedLocations.get(3).getGeoname().getGeonameID()); } /** * Ensure we select the correct Springfield in a document about * Missouri using context-based heuristic matching. */ @Test public void testHeuristicsMissouri() throws ClavinException { String[] locations = {"Kansas City", "Springfield", "St. Louis", "Independence"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Kansas City\"", KANSAS_CITY_MO, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Springfield\"", SPRINGFIELD_MO, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"St. Louis\"", ST_LOUIS_MO, resolvedLocations.get(2).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Independence\"", INDEPENDENCE_MO, resolvedLocations.get(3).getGeoname().getGeonameID()); } /** * Ensure we select the correct Haverhill in a document about * England using context-based heuristic matching. */ @Test public void testHeuristicsEngland() throws ClavinException { String[] locations = {"London", "Manchester", "Haverhill"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"London\"", LONDON_UK, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Manchester\"", MANCHESTER_UK, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Haverhill\"", HAVERHILL_UK, resolvedLocations.get(2).getGeoname().getGeonameID()); } /** * Ensure we select the correct London in a document about * Ontario using context-based heuristic matching. */ @Test public void testHeuristicsOntario() throws ClavinException { String[] locations = {"Toronto", "Ottawa", "Hamilton", "Kitchener", "London"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Toronto\"", TORONTO_ON, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Ottawa\"", OTTAWA_ON, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Hamilton\"", HAMILTON_ON, resolvedLocations.get(2).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Kitchener\"", KITCHENER_ON, resolvedLocations.get(3).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"London\"", LONDON_ON, resolvedLocations.get(4).getGeoname().getGeonameID()); } /** * Tests some border cases involving the resolver. */ @Test public void testBorderCases() throws ClavinException { // ensure we get no matches for this crazy String String[] locations = {"jhadghaoidhg"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), false); assertTrue("Heuristic LocationResolver fuzzy off, no match", resolvedLocations.isEmpty()); resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertTrue("Heuristic LocationResolver fuzzy on, no match", resolvedLocations.isEmpty()); } /** * Checks fix of bug where admin1 codes from different countries * were treated as equal. */ @Test public void testHeuristicsNorthAfrica() throws ClavinException { String[] locations = {"Cairo", "Benghazi"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Cairo\"", CAIRO_EG, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Benghazi\"", BENGHAZI_LY, resolvedLocations.get(1).getGeoname().getGeonameID()); } /** * Ensure we select the correct Washington in a document about * Washington, DC using context-based heuristic matching. */ @Test public void testHeuristicsDC() throws ClavinException { String[] locations = {"Virginia", "Washington", "Maryland"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Virginia\"", VIRGINIA_US, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Washington\"", WASHINGTON_DC, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Maryland\"", MARYLAND_US, resolvedLocations.get(2).getGeoname().getGeonameID()); } /** * Ensure we select the correct Washington in a document about * Washington State using context-based heuristic matching. */ @Test public void testHeuristicsWA() throws ClavinException { String[] locations = {"Seattle", "Washington", "Tacoma"}; resolvedLocations = resolveWithHeuristics(ClavinLocationResolverTest.makeOccurrencesFromNames(locations), true); assertEquals("LocationResolver chose the wrong \"Seattle\"", SEATTLE_WA, resolvedLocations.get(0).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Washington\"", WASHINGTON_STATE_US, resolvedLocations.get(1).getGeoname().getGeonameID()); assertEquals("LocationResolver chose the wrong \"Tacoma\"", TACOMA_WA, resolvedLocations.get(2).getGeoname().getGeonameID()); } }