/* Copyright 2009-2013 The MITRE Corporation. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. * ************************************************************************** * NOTICE * This software was produced for the U. S. Government under Contract No. * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer * Software and Noncommercial Computer Software Documentation Clause * 252.227-7014 (JUN 1995) * * (c) 2012 The MITRE Corporation. All Rights Reserved. * ************************************************************************** **/ package org.opensextant.toolbox; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.opensextant.placedata.PlaceCandidate; import org.opensextant.placedata.PlaceEvidence; import org.opensextant.placedata.PlaceEvidence.Scope; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import gate.Annotation; import gate.AnnotationSet; import gate.ProcessingResource; import gate.Resource; import gate.creole.AbstractLanguageAnalyser; import gate.creole.ExecutionException; import gate.creole.ResourceInstantiationException; import gate.creole.metadata.CreoleParameter; import gate.creole.metadata.CreoleResource; import gate.creole.metadata.Optional; import gate.creole.metadata.RunTime; /** * This is the GATE ProcessingResource wrapper for the Cantilever class. It * performs a geospatial-specific form of co-referencing. It also propagates * evidence from a geospatial entity to co-references of that entity. */ @CreoleResource(name = "OpenSextant Cantilever Processor", comment = "A plugin that performs a simple form of " + "co-reference matching among geospatial entities ") public class CantileverPR extends AbstractLanguageAnalyser implements ProcessingResource { private static final long serialVersionUID = -5055098862407377701L; private String outputAnnotationSet; private String candidateAnnotationName; private String candidateFeatureName; private boolean doCoref = true; private transient Cantilever cntlvr; private transient Scorer scr; private static final Logger LOGGER = LoggerFactory.getLogger(CantileverPR.class); private void initialize() { cntlvr = new Cantilever(); scr = new Scorer(); } @Override public Resource init() throws ResourceInstantiationException { initialize(); return this; } @Override public void reInit() throws ResourceInstantiationException { initialize(); } /** * This does the actual work of analyzing the evidence attached to the place * candidates. It invokes Cantilever's <code>propagateEvidence()</code> for * the place candidates found in each document. * * @throws ExecutionException */ @Override public void execute() throws ExecutionException { // the list of PC objects List<PlaceCandidate> pcList = new ArrayList<PlaceCandidate>(); // get the annotation set AnnotationSet annotSet = (outputAnnotationSet == null || "".equals(outputAnnotationSet)) ? document.getAnnotations() : document.getAnnotations(outputAnnotationSet); // Get all of the placecandidate annotations AnnotationSet candidateSet = annotSet.get(candidateAnnotationName); // add each of the PlaceCandidate objs which are attached to the // annotations to the list for (Annotation candAnno : candidateSet) { // get the PlaceCandidate object PlaceCandidate pc = (PlaceCandidate) candAnno.getFeatures().get(candidateFeatureName); if (pc != null) { pcList.add(pc); } else { LOGGER.error("Null PC on annotation" + candAnno); } } // end place candidate loop // enable/disable co-referencing if (doCoref) { // do the coreferencing and propagate the evidence amongst the PCs cntlvr.propagateEvidence(pcList); // collect the document level evidence List<PlaceEvidence> docEvidList = collectDocumentEvidence(annotSet); // attach document level evidence to scorer scr.setDocumentLevelEvidence(docEvidList); } // score and rank the Places in each PC according to the evidence scr.score(pcList); } /** End execute. */ public String getOutputAnnotationSet() { return outputAnnotationSet; } @Optional @RunTime @CreoleParameter public void setOutputAnnotationSet(String outputAnnotationSet) { this.outputAnnotationSet = outputAnnotationSet; } public String getCandidateAnnotationName() { return candidateAnnotationName; } @Optional @RunTime @CreoleParameter(defaultValue = "placecandidate") public void setCandidateAnnotationName(String candidateAnnotationName) { this.candidateAnnotationName = candidateAnnotationName; } public String getCandidateFeatureName() { return candidateFeatureName; } @Optional @RunTime @CreoleParameter(defaultValue = "placeCandidate") public void setCandidateFeatureName(String candidateFeatureName) { this.candidateFeatureName = candidateFeatureName; } private List<PlaceEvidence> collectDocumentEvidence(AnnotationSet annotSet) { double countryWeight = 0.1; double adminWeight = 0.05; // document level country and admin1 evidence List<PlaceEvidence> docEvidList = new ArrayList<PlaceEvidence>(); // collect all of the countries, capitals and admin1s mentioned AnnotationSet countrySet = annotSet.get("Country"); AnnotationSet capitalSet = annotSet.get("NationalCapital"); AnnotationSet adminSet = annotSet.get("Admin1"); // how many times has a country been mentioned Map<String, Integer> countryCounts = new HashMap<String, Integer>(); Map<String, Double> countryBiases = new HashMap<String, Double>(); double countryTotal = 0.0; // how many times has an admin1 been mentioned Map<String, Integer> adminCounts = new HashMap<String, Integer>(); Map<String, Double> adminBiases = new HashMap<String, Double>(); double adminTotal = 0.0; // collect country counts for (Annotation countryAnno : countrySet) { String tmpCC = (String) countryAnno.getFeatures().get("countryCode"); String countryName = (String) countryAnno.getFeatures().get("string"); // TODO replace with abbreviation when added to gazetteer // don't use countries when only country code if (countryName != null && countryName.length() > 3) { if (!countryCounts.keySet().contains(tmpCC)) { countryCounts.put(tmpCC, 0); } countryCounts.put(tmpCC, countryCounts.get(tmpCC) + 1); countryTotal = countryTotal + 1; } } // collect capital counts for (Annotation capitalAnno : capitalSet) { String tmpCC = (String) capitalAnno.getFeatures().get("countryCode"); String capitalName = (String) capitalAnno.getFeatures().get("string"); // TODO replace with abbreviation when added to gazetteer // don't use capital when only code/short if (capitalName != null && capitalName.length() > 3) { if (!countryCounts.keySet().contains(tmpCC)) { countryCounts.put(tmpCC, 0); } countryCounts.put(tmpCC, countryCounts.get(tmpCC) + 1); countryTotal = countryTotal + 1; } } // collect admin counts for (Annotation adminAnno : adminSet) { String tmpCC = (String) adminAnno.getFeatures().get("countryCode"); String tmpAdmCode = (String) adminAnno.getFeatures().get("adm1code"); String adminName = (String) adminAnno.getFeatures().get("string"); String adminKey = tmpCC + "/" + tmpAdmCode; // TODO replace with abbreviation when added to gazetteer // don't use countries/admin when only admin code if (adminName != null && adminName.length() > 3 && tmpAdmCode != null && tmpAdmCode.length() > 1) { if (!countryCounts.keySet().contains(tmpCC)) { countryCounts.put(tmpCC, 0); } if (!adminCounts.keySet().contains(adminKey)) { adminCounts.put(adminKey, 0); } countryCounts.put(tmpCC, countryCounts.get(tmpCC) + 1); countryTotal = countryTotal + 1; adminCounts.put(adminKey, adminCounts.get(adminKey) + 1); adminTotal = adminTotal + 1; } } // normalize country and admin counts by total seen for (String cc : countryCounts.keySet()) { countryBiases.put(cc, countryCounts.get(cc) / countryTotal); } for (String ac : adminCounts.keySet()) { adminBiases.put(ac, adminCounts.get(ac) / adminTotal); } // create document level evidence based on the countries and admin1s // seen for (String cc : countryBiases.keySet()) { PlaceEvidence ccEvid = new PlaceEvidence(); ccEvid.setCountryCode(cc); ccEvid.setScope(Scope.DOCUMENT); ccEvid.setWeight(countryWeight * countryBiases.get(cc)); ccEvid.setRule("CountryBias"); docEvidList.add(ccEvid); } for (String adminKey : adminBiases.keySet()) { PlaceEvidence acEvid = new PlaceEvidence(); String[] pieces = adminKey.split("/"); acEvid.setCountryCode(pieces[0]); acEvid.setAdmin1(pieces[1]); acEvid.setScope(Scope.DOCUMENT); acEvid.setWeight(adminWeight * adminBiases.get(adminKey)); acEvid.setRule("AdminBias"); docEvidList.add(acEvid); } return docEvidList; } } // end class