/**
* Copyright 2014 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*/
package org.opensextant.extractors.geo.rules;
import java.util.List;
import org.opensextant.data.Place;
import org.opensextant.extractors.geo.PlaceCandidate;
import org.opensextant.extractors.geo.PlaceEvidence;
/**
* A rule that associates a CODE with a NAME, when the pattern
*
* "NAME, CODE" appears within N characters of each other.
*
* If CODE.adm1 == NAME.adm1 and CODE is an ADM1 boundary, then flag this is significant.
*
*
*
* @author ubaldino
*
*/
public class NameCodeRule extends GeocodeRule {
private static int MAX_CHAR_DIST = 4;
public final static String NAME_ADMCODE_RULE = "AdminCode";
public final static String NAME_ADMNAME_RULE = "AdminName";
public NameCodeRule() {
NAME = "AdminCodeOrName";
weight = 3;
}
/**
* Requirement: List of place candidate is a linked list.
*/
@Override
public void evaluate(final List<PlaceCandidate> names) {
for (int x = 0; x < names.size() - 1; ++x) {
PlaceCandidate name = names.get(x);
PlaceCandidate code = names.get(x + 1); /* code or name of admin area*/
if (name.isFilteredOut() || code.isFilteredOut()) {
continue;
}
/*
* COUNTRY, STATE is not supported under this rule.
* E.g., Uruguay, Argentina ... This looks like a list of countries
* However Uruguay is a district in Argentina; Just as Georgia is a state in US
* and also a country name.
*/
if (name.isCountry) {
continue;
}
/*
* Test if SOMENAME, CODE is the case. a1.....a2.b1.., where b1 > a2
* > a1, but distance is minimal from end of name to start of code.
*
*/
if ((code.start - name.end) > MAX_CHAR_DIST) {
continue;
}
/*
* Not supporting lowercase codes/abbreviations. 'la', 'is', 'un', etc.
*/
if (code.isLower() && code.getText().length()<4) {
continue;
}
boolean comma = false;
if (name.getPostmatchTokens() != null) {
// Parse tokens or text following NAME.... CODE
// Proximity is one factor, but conventional format should weigh more.
if (",".equals(name.getPostmatchTokens()[0])) {
comma = true;
}
}
/*
* by this point a place name tag should be marked as a name or
* code/abbrev. Match the abbreviation with a geographic location
* that is a state, county, district, etc.
*/
Place country = code.isCountry ? code.getChosen() : null;
log.debug("{} name, code: {} in {}?", NAME, name.getText(), code.getText());
for (Place geo : code.getPlaces()) {
if (!geo.isAdministrative() || geo.getCountryCode() == null) {
continue;
}
// Provinces, states, districts, etc. Only.
//
// Make sure you can match an province name or code with the gazetteer entries found:
// Boston, Ma. ==== for 'Ma', resolve to an abbreviation for Massachusetts
// Ignore places called 'Ma'
//
// Place ('Ma') == will have gazetteer metadata indicating if this is a valid abbreviated code for a place.
// PlaceCandidate('Ma.') will have textual metadata from given text indicating if it is a code, MA, or abbrev. 'Ma.'
//
// These two situations must match here. We ignore geo locations that do not fit this profile.
//
boolean lexicalMatch = ((code.isAbbreviation && geo.isAbbreviation()) ||
(!code.isAbbreviation && !geo.isAbbreviation()));
//
if (!lexicalMatch) {
continue;
}
String adm1 = geo.getHierarchicalPath();
if (adm1 == null && !code.isCountry) {
log.debug("ADM1 hierarchical path should not be null");
continue;
}
// Quick determination if these two places have a containment or geopolitical connection
//
boolean contains = name.presentInHierarchy(adm1)
|| (country != null ? name.presentInCountry(country.getCountryCode()) : false);
if (!contains) {
continue;
}
/* CITY, STATE
* CITY, COUNTRY
*/
// Associate the CODE to the NAME that precedes it.
//
PlaceEvidence ev = new PlaceEvidence();
ev.setCountryCode(geo.getCountryCode());
ev.setAdmin1(geo.getAdmin1());
ev.setEvaluated(true); // Shunt. Evaluate this rule here.
int wt = weight + (comma ? 2 : 0);
if (geo.isAbbreviation() && (code.isAbbreviation || code.isAcronym)) {
ev.setRule(NAME_ADMCODE_RULE);
ev.setWeight(wt + 1);
} else {
ev.setRule(NAME_ADMNAME_RULE);
ev.setWeight(wt);
}
name.addEvidence(ev);
if (boundaryObserver != null) {
boundaryObserver.boundaryLevel1InScope(geo);
}
// Now choose which location for CITY (name) best suits this.
// Actually increase score for all geos that match the criteria.
//
for (Place nameGeo : name.getPlaces()) {
if (!(nameGeo.isPopulated() || nameGeo.isAdministrative() || nameGeo.isSpot())) {
continue;
}
if (adm1 != null && adm1.equals(nameGeo.getHierarchicalPath())) {
name.incrementPlaceScore(nameGeo, ev.getWeight());
} else if (sameCountry(nameGeo, country)) {
name.incrementPlaceScore(nameGeo, ev.getWeight());
}
}
}
}
}
/**
* No-op.
*/
@Override
public void evaluate(PlaceCandidate name, Place geo) {
// no-op
}
}