PlaceCandidate.java example

Explorer

Xponents-master
- Basics
  - src
    - main
      - java
        org
        opensextant
        ConfigException.java
        data
        Country.java
        DocInput.java
        GeoBase.java
        Geocoding.java
        Language.java
        LatLon.java
        Place.java
        Taxon.java
        TextInput.java
        extraction
        ExtractionException.java
        ExtractionMetrics.java
        ExtractionResult.java
        Extractor.java
        MatchFilter.java
        NormalizationException.java
        TextEntity.java
        TextMatch.java
        processing
        Parameters.java
        ProcessingException.java
        util
        AnyFilenameFilter.java
        FileUtility.java
        GeodeticUtility.java
        GeonamesUtility.java
        TextUtils.java
    - test
      - java
        MetricsTest.java
        TestGeoUtils.java
        TestGeonamesLanguages.java
        TestGeonamesMeta.java
        TestTextUtils.java
- Examples
  - src
    - main
      - java
        org
        opensextant
        examples
        BasicGeoTemporalProcessing.java
        TaxonomicTagger.java
        WebCrawl.java
        twitter
        MicroMessage.java
        Tweet.java
        TweetGeocoder.java
- Extraction
  - src
    - main
      - java
        org
        opensextant
        extraction
        SolrMatcherSupport.java
        SolrTaggerRequest.java
        extractors
        geo
        BoundaryObserver.java
        CountryCount.java
        CountryObserver.java
        GazetteerMatcher.java
        GazetteerUpdateProcessorFactory.java
        LocationObserver.java
        PlaceCandidate.java
        PlaceCount.java
        PlaceEvidence.java
        PlaceGeocoder.java
        ScoredPlace.java
        SolrGazetteer.java
        TagFilter.java
        rules
        ContextualOrganizationRule.java
        CoordinateAssociationRule.java
        CountryRule.java
        GeocodeRule.java
        LocationChooserRule.java
        MajorPlaceRule.java
        NameCodeRule.java
        NameRule.java
        NonsenseFilter.java
        PersonNameFilter.java
        ProvinceAssociationRule.java
        xtax
        TaxonMatch.java
        TaxonMatcher.java
        output
        AbstractFormatter.java
        CSVFormatter.java
        FormatterFactory.java
        GDBFormatter.java
        GISDataFormatter.java
        GISDataModel.java
        GeoCSVFormatter.java
        KMLFormatter.java
        OpenSextantSchema.java
        ResultsFormatter.java
        ShapefileFormatter.java
        WKTFormatter.java
        processing
        ResultsUtility.java
        XtractorGroup.java
        progress
        ProgressListener.java
        ProgressMonitor.java
        ProgressMonitorBase.java
        util
        SolrProxy.java
        SolrUtil.java
    - test
      - java
        org
        opensextant
        extractors
        test
        TestExtraction.java
        TestGazFactory.java
        TestGazMatcher.java
        TestGazetteer.java
        TestGazetteerConflationKey.java
        TestPersonFilter.java
        TestPlaceGeocoder.java
        TestPlaceGeocoderLanguages.java
        TestStopFilters.java
        TestUtils.java
        TestXTax.java
- MapReduce
  - src
    - main
      - java
        org
        opensextant
        mapreduce
        AbstractMapper.java
        GeoTaggerMapper.java
        KeywordTaggerMapper.java
        Log4JUtils.java
        LoggingUtilities.java
        XponentsTaggerDemo.java
    - test
      - java
        org
        apache
        solr
        core
        CoreContainer.java
- Patterns
  - src
    - main
      - java
        org
        opensextant
        extractors
        flexpat
        AbstractFlexPat.java
        PatternTestCase.java
        RegexPattern.java
        RegexPatternManager.java
        TextMatchResult.java
        poli
        PatternsOfLife.java
        PoliMatch.java
        PoliPatternManager.java
        TestCase.java
        data
        MACAddress.java
        Money.java
        TelephoneNumber.java
        xcoord
        DMSFilter.java
        DMSOrdinate.java
        GeocoordMatch.java
        GeocoordMatchFilter.java
        GeocoordNormalization.java
        GeocoordPattern.java
        GeocoordPrecision.java
        GeocoordTestCase.java
        Hemisphere.java
        MGRSFilter.java
        MGRSParser.java
        PatternManager.java
        PrecisionScales.java
        UTMParser.java
        XConstants.java
        XCoord.java
        xtemporal
        DateMatch.java
        DateNormalization.java
        DateTimePattern.java
        PatternManager.java
        TestCase.java
        XTConstants.java
        XTemporal.java
    - test
      - java
        org
        opensextant
        extractors
        test
        DateNormalizationTest.java
        PrecisionScalesTest.java
        TestPoLi.java
        TestPoLiReporter.java
        TestXCoord.java
        TestXCoordReporter.java
        TestXTemporal.java
        TestXTemporalReporter.java
- XText
  - examples
  - src
    - main
      - java
        org
        opensextant
        xtext
        Content.java
        ConversionListener.java
        ConvertedDocument.java
        Converter.java
        ExclusionFilter.java
        PathManager.java
        XText.java
        collectors
        ArchiveNavigator.java
        CollectionListener.java
        Collector.java
        mailbox
        DefaultMailCrawl.java
        MailClient.java
        MailConfig.java
        NTLMAuth.java
        OutlookPSTCrawler.java
        sharepoint
        DefaultSharepointCrawl.java
        SPLink.java
        SharepointClient.java
        web
        CrawlFilter.java
        DefaultWebCrawl.java
        HyperLink.java
        WebClient.java
        converters
        ConverterAdapter.java
        DefaultConverter.java
        EmbeddedContentConverter.java
        ImageMetadataConverter.java
        MessageConverter.java
        TextTranscodingConverter.java
        TikaHTMLConverter.java
        WebArchiveConverter.java
    - test
      - java
        org
        opensextant
        xtext
        converters
        test
        MessageConverterTest.java
        test
        Decomposer.java
        ImageGroper.java
        MailClientTest.java
        SharepointClientTest.java
        SharepointCrawlTest.java
        TestPST.java
        TestSPLinks.java
        TestTikaPST.java
        Tests.java
        TextTranscodingTest.java
        WebLinkTest.java
- Xlayer
  - src
    - main
      - java
        org
        opensextant
        xlayer
        Transforms.java
        XlayerClient.java
        server
        RequestParameters.java
        TaggerResource.java
        XlayerApp.java
        xgeo
        XlayerRestlet.java
        XlayerServer.java
        XponentsGeotagger.java
    - test
      - java
        XlayerClientTest.java

/**
 * Copyright 2012-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 *
 *
 * **************************************************************************
 * NOTICE This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 *
 */
package org.opensextant.extractors.geo;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.opensextant.data.Geocoding;
import org.opensextant.data.LatLon;
import org.opensextant.data.Place;
import org.opensextant.extraction.TextMatch;
import org.opensextant.util.TextUtils;

/**
 * A PlaceCandidate represents a portion of a document which has been identified
 * as a possible named geographic location. It is used to collect together the
 * information from the document (the evidence), as well as the possible
 * geographic locations it could represent (the Places). It also contains the
 * results of the final decision, which include:
 * <ul>
 * <li>first choice - of all the places with the same/similar names, which place
 * is it? See {@link #choose()} and {@link #getChosen()}.</li>
 * <li>second choice - the runner-up, used to detect ambiguity. See
 * {@link #isAmbiguous()}.</li>
 * <li>confidence - a single 0..100 measure for the mention as a whole.</li>
 * </ul>
 * This class keeps mutable state in plain collections and fields; nothing in it
 * synchronizes access.
 *
 * @author ubaldino
 * @author dlutz, based on OpenSextant Toolbox
 */
public class PlaceCandidate extends TextMatch {

    /** Weight of the name-match component in {@link #defaultScore(Place)}. */
    public static final double NAME_WEIGHT = 0.2;
    /** Weight of the feature-type component in {@link #defaultScore(Place)}. */
    public static final double FEAT_WEIGHT = 0.1;
    /** Weight of the place's own ID bias in {@link #defaultScore(Place)}. */
    public static final double LOCATION_BIAS_WEIGHT = 0.7;

    /**
     * Default weighting increments per feature class, encoded as "CLASS:weight".
     * Looked up with {@link Place#getFeatureClass()} in {@link #scoreFeature(Place)}.
     */
    private static final String[] CLASS_SCALE = {
            "A:3",
            "P:2",
            "L:1",
            "R:0",
            "H:1",
            "V:0",
            "T:1"
    };

    /**
     * Default weights per feature code (designation), encoded as "CODE:weight".
     * Looked up with {@link Place#getFeatureCode()} in {@link #scoreFeature(Place)}.
     */
    private static final String[] DESIGNATION_SCALE = {
            /* Places: cities, villages, ruins, etc.*/
            "PPLC:12",
            "PPLA:8",
            "PPLG:7",
            "PPL:5",
            "PPLL:2",
            "PPLQ:2",
            "PPLX:2",
            /* Administrative regions */
            "ADM1:9",
            "ADM2:8",
            "ADM3:7",
            /* Other geographic features */
            "ISL:4",
            "ISLS:5"
    };

    private static final Map<String, Integer> classWeight = new HashMap<>();
    private static final Map<String, Integer> designationWeight = new HashMap<>();
    /** Base feature weight used when a place's feature code has no explicit designation weight. */
    private static final int DEFAULT_DESIGNATION_WT = 2;
    /** Width passed to TextUtils.get_text_window when collecting surrounding tokens. */
    private static final int DEFAULT_TOKEN_SIZE = 40;
    /** Rule name recorded when a place is added with its default score. */
    private static final String DEFAULT_SCORE_RULE = "DefaultScore";

    static {
        loadScale(DESIGNATION_SCALE, designationWeight);
        loadScale(CLASS_SCALE, classWeight);
    }

    /** Parses "KEY:weight" entries into the given lookup table. */
    private static void loadScale(String[] scale, Map<String, Integer> target) {
        for (String entry : scale) {
            String[] parts = entry.split(":");
            target.put(parts[0], Integer.parseInt(parts[1]));
        }
    }

    /** Lazily computed, cached normalized form of the matched text; see {@link #getTextnorm()}. */
    private String textnorm = null;

    // --------------Place/NotPlace stuff ----------------------
    // which rules have expressed a Place/NotPlace opinion on this PC
    private final Set<String> rules = new HashSet<>();
    // --------------Disambiguation stuff ----------------------
    // the places along with their disambiguation scores, keyed by makeKey(place)
    private final Map<String, ScoredPlace> scoredPlaces = new HashMap<>();
    // the list of PlaceEvidences accumulated from the document about this PC
    private final List<PlaceEvidence> evidence = new ArrayList<>();
    // The chosen, best place and the runner-up:
    private ScoredPlace choice1 = null;
    private ScoredPlace choice2 = null;
    // Score of the runner-up; stays at -1 until choose() finds a second candidate.
    private double secondPlaceScore = -1;
    private int confidence = 0;
    // Hierarchical paths ('US.CA') and country codes ('US') of all places added so far.
    private final Set<String> hierarchicalPaths = new HashSet<>();
    private final Set<String> countries = new HashSet<>();

    private String[] preTokens = null;
    private String[] postTokens = null;

    /**
     * Common evidence flags -- isCountry, isPerson, isOrganization,
     * abbreviation, and acronym
     */
    public boolean isCountry = false;
    public boolean isContinent = false;
    public boolean isPerson = false;
    public boolean isOrganization = false;
    public boolean isAbbreviation = false;
    public boolean isAcronym = false;
    public boolean hasDiacritics = false;

    /** Creates an empty candidate with no places, rules or evidence. */
    public PlaceCandidate() {
    }

    /**
     * Using a scale of 0 to 100, indicate how confident we are that the chosen place is best.
     * Note this is different than the individual score assigned to each candidate place.
     * We just need one final confidence measure for this place mention.
     * The value is stored as given; no range check is applied.
     *
     * @param c confidence value
     */
    public void setConfidence(int c) {
        confidence = c;
    }

    /**
     * See {@link #setConfidence(int)}.
     *
     * @return the confidence last set, or 0 if never set
     */
    public int getConfidence() {
        return confidence;
    }

    /**
     * If caller is willing to claim an explicit choice, so be it. Otherwise
     * unchosen places go to disambiguation.
     * <p>
     * A {@link ScoredPlace} argument is accepted as the choice directly, whether or
     * not it was previously added. Any other Place is resolved through
     * {@link #makeKey(Place)}; if no scored place is stored under that key, the
     * current choice is left untouched. A null argument is ignored.
     *
     * @param geo the place to mark as first choice
     */
    public void choose(Place geo) {
        if (geo == null) {
            // Nothing to choose; leave the candidate for normal disambiguation.
            return;
        }
        if (geo instanceof ScoredPlace) {
            choice1 = (ScoredPlace) geo;
            return;
        }
        ScoredPlace known = scoredPlaces.get(makeKey(geo));
        if (known != null) {
            choice1 = known;
        }
    }

    /**
     * Normalized version of the matched text: diacritics removed, punctuation
     * removed, then lower-cased. The value is computed on first use and cached.
     * NOTE(review): lower-casing uses the JVM default locale.
     *
     * @return normalized version of text.
     */
    public String getTextnorm() {
        if (textnorm == null) {
            textnorm = TextUtils.removePunctuation(TextUtils.removeDiacritics(getText())).toLowerCase();
        }
        return textnorm;
    }

    /**
     * Get some sense of tokens surrounding match. Possibly optimize this by
     * getting token list from SolrTextTagger (which provides the
     * lang-specifics)
     * <p>
     * The window is obtained from TextUtils.get_text_window; its first two offsets
     * delimit the text handed to tokensRight (pre-match tokens) and its last two
     * the text handed to tokensLeft (post-match tokens).
     *
     * @param sourceBuffer the full text in which this match was found; must not be null
     */
    protected void setSurroundingTokens(String sourceBuffer) {
        int matchLength = end - start;
        int[] window = TextUtils.get_text_window(start, matchLength, sourceBuffer.length(), DEFAULT_TOKEN_SIZE);

        /*
         * Get right most or left most whole tokens, for now whitespace
         * delimited. TODO: ensure whole tokens are retrieved.
         */
        String before = sourceBuffer.substring(window[0], window[1]);
        String after = sourceBuffer.substring(window[2], window[3]);
        setPrematchTokens(TextUtils.tokensRight(before));
        setPostmatchTokens(TextUtils.tokensLeft(after));
    }

    /**
     * After candidate has been scored and all, the final best place is the
     * geocoding result for the given name in context.
     *
     * @return the chosen place, or null if this candidate has no places
     */
    public Geocoding getGeocoding() {
        choose();
        return getChosen();
    }

    /**
     * Plain getter; does not trigger ranking.
     *
     * @return the first choice, or null if nothing has been chosen yet
     */
    public ScoredPlace getChosen() {
        return choice1;
    }

    /**
     * Alias for {@link #getChosen()}.
     *
     * @return the first choice, or null if nothing has been chosen yet
     */
    public ScoredPlace getFirstChoice() {
        return getChosen();
    }

    /**
     * Rank the scored places and record the first and second choices.
     * Typical usage:
     *
     * <pre>
     * choose();     // this does work. performance cost.
     * getChosen();  // this is a getter; no performance cost
     * </pre>
     *
     * If a first choice already exists (from an earlier call or from
     * {@link #choose(Place)}) nothing is recomputed. If no places have been added,
     * the choices stay null rather than failing.
     * Ranking uses the natural ordering of {@link ScoredPlace}; the first element
     * after sorting is taken as the best.
     */
    public void choose() {
        if (choice1 != null) {
            return;
        }
        if (scoredPlaces.isEmpty()) {
            // No candidates at all: getChosen() keeps returning null.
            return;
        }

        List<ScoredPlace> ranked = new ArrayList<>(scoredPlaces.values());
        Collections.sort(ranked);

        choice1 = ranked.get(0);
        if (ranked.size() > 1) {
            choice2 = ranked.get(1);
            secondPlaceScore = choice2.getScore();
        }
    }

    /**
     * This only makes sense if you tried choose() first
     * to sort scored places.
     *
     * @return true when both a first and a second choice exist and the first
     *         choice's score is not strictly greater than the second's; false otherwise
     */
    public boolean isAmbiguous() {
        if (choice2 != null && choice1 != null) {
            // A tie (or anything other than a strict win) counts as ambiguous.
            return !(choice1.getScore() > choice2.getScore());
        }
        return false;
    }

    /**
     * Only call after choose() operation.
     *
     * @return score of the second choice, or -1 if choose() has not recorded one
     */
    public double getSecondChoiceScore() {
        return secondPlaceScore;
    }

    /**
     * @return the runner-up recorded by {@link #choose()}, or null if there is none
     */
    public ScoredPlace getSecondChoice() {
        return choice2;
    }

    /**
     * @return live view of the scored places held by this candidate (not a copy)
     */
    public Collection<ScoredPlace> getPlaces() {
        return scoredPlaces.values();
    }

    /**
     * Add a new place with a default score, and record the "DefaultScore" rule.
     *
     * @param place place to add; its score is overwritten with {@link #defaultScore(Place)}
     */
    public void addPlace(ScoredPlace place) {
        this.addPlace(place, defaultScore(place));
        this.rules.add(DEFAULT_SCORE_RULE);
    }

    /**
     * @return true if "DefaultScore" is the one and only rule recorded on this candidate
     */
    public boolean hasDefaultRuleOnly() {
        return rules.size() == 1 && rules.contains(DEFAULT_SCORE_RULE);
    }

    /**
     * Each place has an ID, but this candidate scoring mechanism must score
     * distinct ID+NAME tuples.  As name variances play into scoring and choosing.
     *
     * @param p place to key; must not be null
     * @return key of the form "placeKey~normalizedName"
     */
    public String makeKey(Place p) {
        return String.format("%s~%s", p.getKey(), p.getNamenorm());
    }

    /**
     * Add a new place with a specific score. A place already stored under the same
     * key is replaced. The place's hierarchical path and country code, when
     * present, are remembered for {@link #presentInHierarchy(String)} and
     * {@link #presentInCountry(String)}.
     *
     * @param place place to add
     * @param score score to assign to the place
     */
    public void addPlace(ScoredPlace place, Double score) {
        place.setScore(score);
        this.scoredPlaces.put(makeKey(place), place);

        // 'US.CA' or 'US.06', etc.
        if (place.getHierarchicalPath() != null) {
            this.hierarchicalPaths.add(place.getHierarchicalPath());
        }
        // 'US'
        if (place.getCountryCode() != null) {
            this.countries.add(place.getCountryCode());
        }
    }

    /**
     * Given this candidate, how do you score the provided place
     * just based on those place properties (and not on context, document properties,
     * or other evidence)?
     *
     * This 'should' produce a base score of something between 0 and 1.0, or 0..10.
     * These scores do not necessarily need to stay in that range, as they are all relative.
     * However, as rules fire and compare location data it is better to stay in a known range
     * for sanity sake.
     *
     * @param g place to score
     * @return 10 x the weighted sum of name score, feature score and the place's ID bias
     */
    public double defaultScore(Place g) {
        double nameScore = scoreName(g);
        double featureScore = scoreFeature(g);
        double idBias = g.getId_bias();

        double baseScore = (NAME_WEIGHT * nameScore) + (FEAT_WEIGHT * featureScore)
                + (LOCATION_BIAS_WEIGHT * idBias);
        return 10 * baseScore;
    }

    /**
     * Produce a goodness score, nominally in the range 0 to 1.0. The bonus and
     * penalty adjustments below can push the result slightly outside that range.
     *
     * Trivial examples of name matching:
     *
     * <pre>
     *  given some patterns, 'geo' match Text
     *
     *   case 1. 'Alberta' matches ALBERTA or alberta just fine.
     *   case 2. 'La' matches LA, however, knowing "LA" is a acronym/abbreviation
     *       adds to the score of any geo that actually is "LA"
     *   case 3. 'Afghanestan' matches Afghanistan, but decrement because it is not perfectly spelled.
     *
     * </pre>
     *
     * @param g place whose name is compared to this match
     * @return (length - edit distance +/- adjustments) / length of the normalized
     *         match text; 0.0 when the normalized match text is empty
     */
    protected double scoreName(Place g) {
        String norm = getTextnorm();
        int startingScore = norm.length();
        if (startingScore == 0) {
            // Nothing left after normalization; avoid dividing by zero (NaN/Infinity).
            return 0.0;
        }

        int editDist = StringUtils.getLevenshteinDistance(norm, g.getNamenorm());
        int score = startingScore - editDist;

        boolean upperMatch = isUpper();
        if (upperMatch && (g.isAbbreviation() || TextUtils.isUpper(g.getName()))) {
            ++score;
        } else if (!upperMatch && g.isAbbreviation()) {
            // Mismatch in case for abbreviation.
            --score;
        }

        boolean asciiMatch = isASCII();
        boolean asciiName = g.isASCIIName();
        if (asciiMatch != asciiName) {
            // Mismatch in name diacritics downgrades name score here.
            --score;
        } else if (asciiMatch) {
            // Both plain ASCII.
            ++score;
        }
        return (float) score / startingScore;
    }

    /**
     * A preference for features that are major places or boundaries.
     * The result is a weight divided by 10: an explicit designation weight when the
     * feature code has one (so "PPLC" yields 1.2), otherwise the default
     * designation weight plus any feature-class weight.
     *
     * @param g place whose feature code/class is scored
     * @return feature score, nominally on a 0 to 1.0 scale
     */
    protected double scoreFeature(Place g) {
        Integer designation = designationWeight.get(g.getFeatureCode());
        if (designation != null) {
            return (float) designation / 10;
        }

        int score = DEFAULT_DESIGNATION_WT;
        Integer byClass = classWeight.get(g.getFeatureClass());
        if (byClass != null) {
            score += byClass.intValue();
        }
        return (float) score / 10;
    }

    /**
     * Increment the score of an existing place. A null place, or one that was never
     * added to this candidate, is silently ignored.
     *
     * @param place place whose score is to be incremented
     * @param score amount to add
     */
    public void incrementPlaceScore(Place place, Double score) {
        if (place == null) {
            return;
        }
        ScoredPlace current = this.scoredPlaces.get(makeKey(place));
        if (current != null) {
            current.incrementScore(score);
        }
    }

    /**
     * Set the score of an existing place. If no place is stored under the same key,
     * nothing happens; otherwise the given place object replaces the stored one.
     *
     * @param place place whose score is to be set
     * @param score new score
     */
    public void setPlaceScore(ScoredPlace place, Double score) {
        if (!this.scoredPlaces.containsKey(makeKey(place))) {
            // Tried to set a score for a non-existent Place; ignore.
            return;
        }
        addPlace(place, score);
    }

    /**
     * @return live view of the rule names recorded on this candidate (not a copy)
     */
    public Collection<String> getRules() {
        return rules;
    }

    /**
     * @param rule rule name
     * @return true if that rule has been recorded on this candidate
     */
    public boolean hasRule(String rule) {
        return rules.contains(rule);
    }

    /**
     * @param rule rule name to record
     */
    public void addRule(String rule) {
        rules.add(rule);
    }

    /**
     * Add a piece of evidence; its rule name, if any, is also recorded in the rule set.
     *
     * @param evidence evidence to add
     */
    public void addEvidence(PlaceEvidence evidence) {
        this.evidence.add(evidence);
        if (evidence.getRule() != null) {
            this.rules.add(evidence.getRule());
        }
    }

    /**
     * Add evidence built from an existing place.
     *
     * @param rule   rule that produced the evidence
     * @param weight evidence weight
     * @param ev     place the evidence is based on
     */
    public void addEvidence(String rule, double weight, Place ev) {
        addEvidence(new PlaceEvidence(ev, rule, weight));
    }

    /** Creates a PlaceEvidence carrying only the given rule name and weight. */
    private static PlaceEvidence newEvidence(String rule, double weight) {
        PlaceEvidence ev = new PlaceEvidence();
        ev.setRule(rule);
        ev.setWeight(weight);
        return ev;
    }

    /*
     * NOTE(review): the convenience methods below append to the evidence list
     * directly and, unlike addEvidence(PlaceEvidence), do not record the rule name
     * in the rule set. Confirm whether that difference is intentional before
     * changing it, since hasDefaultRuleOnly() depends on the rule set.
     */

    /**
     * Convenience method to add evidence from individual attributes; null
     * attributes are skipped.
     *
     * @param rule   rule that produced the evidence
     * @param weight evidence weight
     * @param cc     country code, or null
     * @param adm1   admin level 1 code, or null
     * @param fclass feature class, or null
     * @param fcode  feature code, or null
     * @param geo    coordinate, or null
     */
    public void addEvidence(String rule, double weight, String cc, String adm1, String fclass, String fcode,
            LatLon geo) {
        PlaceEvidence ev = newEvidence(rule, weight);
        if (cc != null) {
            ev.setCountryCode(cc);
        }
        if (adm1 != null) {
            ev.setAdmin1(adm1);
        }
        if (fclass != null) {
            ev.setFeatureClass(fclass);
        }
        if (fcode != null) {
            ev.setFeatureCode(fcode);
        }
        if (geo != null) {
            ev.setLatLon(geo);
        }
        this.evidence.add(ev);
    }

    /**
     * Add country evidence and increment score immediately. The evidence is marked
     * as evaluated, and the score of {@code geo} (if it is a known place of this
     * candidate) is raised by {@code weight}.
     *
     * @param rule   rule that produced the evidence
     * @param weight evidence weight, also the score increment
     * @param cc     country code
     * @param geo    place whose score is incremented
     */
    public void addCountryEvidence(String rule, double weight, String cc, Place geo) {
        PlaceEvidence ev = newEvidence(rule, weight);
        ev.setCountryCode(cc);
        ev.setEvaluated(true);
        this.evidence.add(ev);

        this.incrementPlaceScore(geo, /*1 x */ weight);
    }

    /**
     * Add admin-level-1 evidence.
     *
     * @param rule   rule that produced the evidence
     * @param weight evidence weight
     * @param adm1   admin level 1 code
     * @param cc     country code
     */
    public void addAdmin1Evidence(String rule, double weight, String adm1, String cc) {
        PlaceEvidence ev = newEvidence(rule, weight);
        ev.setAdmin1(adm1);
        ev.setCountryCode(cc);
        this.evidence.add(ev);
    }

    /**
     * Add feature-class evidence.
     *
     * @param rule   rule that produced the evidence
     * @param weight evidence weight
     * @param fclass feature class
     */
    public void addFeatureClassEvidence(String rule, double weight, String fclass) {
        PlaceEvidence ev = newEvidence(rule, weight);
        ev.setFeatureClass(fclass);
        this.evidence.add(ev);
    }

    /**
     * Add feature-code evidence.
     *
     * @param rule   rule that produced the evidence
     * @param weight evidence weight
     * @param fcode  feature code
     */
    public void addFeatureCodeEvidence(String rule, double weight, String fcode) {
        PlaceEvidence ev = newEvidence(rule, weight);
        ev.setFeatureCode(fcode);
        this.evidence.add(ev);
    }

    /**
     * Add evidence and increment score immediately. The evidence is marked as
     * evaluated, and the score of {@code geo} (if it is a known place of this
     * candidate) is raised by {@code weight * proximityScore}.
     *
     * @param rule           rule that produced the evidence
     * @param weight         evidence weight
     * @param coord          coordinate found in the document
     * @param geo            candidate place associated with the coordinate
     * @param proximityScore multiplier applied to the weight for the score increment
     */
    public void addGeocoordEvidence(String rule, double weight, LatLon coord, Place geo, double proximityScore) {
        PlaceEvidence ev = newEvidence(rule, weight);
        ev.setLatLon(coord);
        ev.setEvaluated(true);
        this.evidence.add(ev);

        // The indirect connection between found coord and closest geo candidate
        // is assessed here, by incrementing the score for geo right away.
        this.incrementPlaceScore(geo, weight * proximityScore);
    }

    /**
     * @return live list of evidence accumulated on this candidate (not a copy)
     */
    public List<PlaceEvidence> getEvidence() {
        return this.evidence;
    }

    /**
     * @return true if at least one place has been added
     */
    public boolean hasPlaces() {
        return !this.scoredPlaces.isEmpty();
    }

    /**
     * Short representation of this candidate; same as {@code summarize(false)}.
     */
    @Override
    public String toString() {
        return summarize(false);
    }

    /**
     * If you need a full print out of the data, use summarize(true);
     *
     * @param dumpAll true to also list every scored place, one per line
     * @return text, confidence and place count, followed by rules and evidence,
     *         and optionally the places
     */
    public String summarize(boolean dumpAll) {
        StringBuilder tmp = new StringBuilder();
        // append() tolerates unset text (prints "null") where the constructor would throw.
        tmp.append(getText());
        tmp.append(String.format("(C=%d, N=%d)", this.getConfidence(), this.scoredPlaces.size()));
        tmp.append("\nRules=").append(rules.toString());
        tmp.append("\nEvidence=").append(evidence.toString());
        if (dumpAll) {
            tmp.append("\nPlaces=\n");
            for (ScoredPlace p : scoredPlaces.values()) {
                tmp.append("\t").append(p.toString()).append("\n");
            }
        }
        return tmp.toString();
    }

    /**
     * @return the preTokens
     */
    public String[] getPrematchTokens() {
        return preTokens;
    }

    /**
     * @param tok
     *            the preTokens to set
     */
    public void setPrematchTokens(String[] tok) {
        this.preTokens = tok;
    }

    /**
     * @return the postTokens
     */
    public String[] getPostmatchTokens() {
        return postTokens;
    }

    /**
     * @param tok
     *            the postTokens to set
     */
    public void setPostmatchTokens(String[] tok) {
        this.postTokens = tok;
    }

    /**
     * Given a path, 'a.b' ( province b in country a),
     * see if this name is present there.
     *
     * @param path hierarchical path to test
     * @return true if some place added to this candidate has exactly that path
     */
    public boolean presentInHierarchy(String path) {
        return this.hierarchicalPaths.contains(path);
    }

    /**
     * @param cc country code to test
     * @return true if some place added to this candidate has that country code
     */
    public boolean presentInCountry(String cc) {
        return this.countries.contains(cc);
    }

    /**
     * How many different countries contain this name?
     *
     * @return number of distinct non-null country codes among the added places
     */
    public int distinctCountryCount() {
        return this.countries.size();
    }

    /**
     * @return number of scored places held; these are keyed by place key plus
     *         normalized name (see {@link #makeKey(Place)}), essentially location
     */
    public int distinctLocationCount() {
        return this.scoredPlaces.size();
    }

}