MGRSParser.java example

Explorer

Xponents-master
- Basics
  - src
    - main
      - java
        org
        opensextant
        ConfigException.java
        data
        Country.java
        DocInput.java
        GeoBase.java
        Geocoding.java
        Language.java
        LatLon.java
        Place.java
        Taxon.java
        TextInput.java
        extraction
        ExtractionException.java
        ExtractionMetrics.java
        ExtractionResult.java
        Extractor.java
        MatchFilter.java
        NormalizationException.java
        TextEntity.java
        TextMatch.java
        processing
        Parameters.java
        ProcessingException.java
        util
        AnyFilenameFilter.java
        FileUtility.java
        GeodeticUtility.java
        GeonamesUtility.java
        TextUtils.java
    - test
      - java
        MetricsTest.java
        TestGeoUtils.java
        TestGeonamesLanguages.java
        TestGeonamesMeta.java
        TestTextUtils.java
- Examples
  - src
    - main
      - java
        org
        opensextant
        examples
        BasicGeoTemporalProcessing.java
        TaxonomicTagger.java
        WebCrawl.java
        twitter
        MicroMessage.java
        Tweet.java
        TweetGeocoder.java
- Extraction
  - src
    - main
      - java
        org
        opensextant
        extraction
        SolrMatcherSupport.java
        SolrTaggerRequest.java
        extractors
        geo
        BoundaryObserver.java
        CountryCount.java
        CountryObserver.java
        GazetteerMatcher.java
        GazetteerUpdateProcessorFactory.java
        LocationObserver.java
        PlaceCandidate.java
        PlaceCount.java
        PlaceEvidence.java
        PlaceGeocoder.java
        ScoredPlace.java
        SolrGazetteer.java
        TagFilter.java
        rules
        ContextualOrganizationRule.java
        CoordinateAssociationRule.java
        CountryRule.java
        GeocodeRule.java
        LocationChooserRule.java
        MajorPlaceRule.java
        NameCodeRule.java
        NameRule.java
        NonsenseFilter.java
        PersonNameFilter.java
        ProvinceAssociationRule.java
        xtax
        TaxonMatch.java
        TaxonMatcher.java
        output
        AbstractFormatter.java
        CSVFormatter.java
        FormatterFactory.java
        GDBFormatter.java
        GISDataFormatter.java
        GISDataModel.java
        GeoCSVFormatter.java
        KMLFormatter.java
        OpenSextantSchema.java
        ResultsFormatter.java
        ShapefileFormatter.java
        WKTFormatter.java
        processing
        ResultsUtility.java
        XtractorGroup.java
        progress
        ProgressListener.java
        ProgressMonitor.java
        ProgressMonitorBase.java
        util
        SolrProxy.java
        SolrUtil.java
    - test
      - java
        org
        opensextant
        extractors
        test
        TestExtraction.java
        TestGazFactory.java
        TestGazMatcher.java
        TestGazetteer.java
        TestGazetteerConflationKey.java
        TestPersonFilter.java
        TestPlaceGeocoder.java
        TestPlaceGeocoderLanguages.java
        TestStopFilters.java
        TestUtils.java
        TestXTax.java
- MapReduce
  - src
    - main
      - java
        org
        opensextant
        mapreduce
        AbstractMapper.java
        GeoTaggerMapper.java
        KeywordTaggerMapper.java
        Log4JUtils.java
        LoggingUtilities.java
        XponentsTaggerDemo.java
    - test
      - java
        org
        apache
        solr
        core
        CoreContainer.java
- Patterns
  - src
    - main
      - java
        org
        opensextant
        extractors
        flexpat
        AbstractFlexPat.java
        PatternTestCase.java
        RegexPattern.java
        RegexPatternManager.java
        TextMatchResult.java
        poli
        PatternsOfLife.java
        PoliMatch.java
        PoliPatternManager.java
        TestCase.java
        data
        MACAddress.java
        Money.java
        TelephoneNumber.java
        xcoord
        DMSFilter.java
        DMSOrdinate.java
        GeocoordMatch.java
        GeocoordMatchFilter.java
        GeocoordNormalization.java
        GeocoordPattern.java
        GeocoordPrecision.java
        GeocoordTestCase.java
        Hemisphere.java
        MGRSFilter.java
        MGRSParser.java
        PatternManager.java
        PrecisionScales.java
        UTMParser.java
        XConstants.java
        XCoord.java
        xtemporal
        DateMatch.java
        DateNormalization.java
        DateTimePattern.java
        PatternManager.java
        TestCase.java
        XTConstants.java
        XTemporal.java
    - test
      - java
        org
        opensextant
        extractors
        test
        DateNormalizationTest.java
        PrecisionScalesTest.java
        TestPoLi.java
        TestPoLiReporter.java
        TestXCoord.java
        TestXCoordReporter.java
        TestXTemporal.java
        TestXTemporalReporter.java
- XText
  - examples
  - src
    - main
      - java
        org
        opensextant
        xtext
        Content.java
        ConversionListener.java
        ConvertedDocument.java
        Converter.java
        ExclusionFilter.java
        PathManager.java
        XText.java
        collectors
        ArchiveNavigator.java
        CollectionListener.java
        Collector.java
        mailbox
        DefaultMailCrawl.java
        MailClient.java
        MailConfig.java
        NTLMAuth.java
        OutlookPSTCrawler.java
        sharepoint
        DefaultSharepointCrawl.java
        SPLink.java
        SharepointClient.java
        web
        CrawlFilter.java
        DefaultWebCrawl.java
        HyperLink.java
        WebClient.java
        converters
        ConverterAdapter.java
        DefaultConverter.java
        EmbeddedContentConverter.java
        ImageMetadataConverter.java
        MessageConverter.java
        TextTranscodingConverter.java
        TikaHTMLConverter.java
        WebArchiveConverter.java
    - test
      - java
        org
        opensextant
        xtext
        converters
        test
        MessageConverterTest.java
        test
        Decomposer.java
        ImageGroper.java
        MailClientTest.java
        SharepointClientTest.java
        SharepointCrawlTest.java
        TestPST.java
        TestSPLinks.java
        TestTikaPST.java
        Tests.java
        TextTranscodingTest.java
        WebLinkTest.java
- Xlayer
  - src
    - main
      - java
        org
        opensextant
        xlayer
        Transforms.java
        XlayerClient.java
        server
        RequestParameters.java
        TaggerResource.java
        XlayerApp.java
        xgeo
        XlayerRestlet.java
        XlayerServer.java
        XponentsGeotagger.java
    - test
      - java
        XlayerClientTest.java

/**
 *
* Copyright 2012-2013 The MITRE Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * **************************************************************************
 *                          NOTICE
 * This software was produced for the U. S. Government under Contract No.
 * W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
 * Software and Noncommercial Computer Software Documentation Clause
 * 252.227-7014 (JUN 1995)
 *
 * (c) 2012 The MITRE Corporation. All Rights Reserved.
 * **************************************************************************
 */
package org.opensextant.extractors.xcoord;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import org.opensextant.geodesy.MGRS;
import org.opensextant.util.TextUtils;

/**
 * Turns text that a regex pattern has already matched as a possible MGRS coordinate
 * into zero, one or two {@link MGRS} interpretations.
 *
 * Obvious false positives (degree notation, common month abbreviations, impossible grid zones)
 * are rejected early by returning {@code null}. Offsets with an odd number of digits are treated
 * as typos and repaired by padding one half with a trailing zero.
 *
 * @author ubaldino
 */
public class MGRSParser {

    /**
     * Given the match parse MGRS as best as can be done.
     * TODO: provide level of confidence.  Items that match MGRS scheme perfectly are more likely to be MGRS than those that
     * are not perfect matches, e.g. typos, inadvertent text wrapping, whitespace etc.
     *
     * NOTE(review): the {@code MGRS} constructor is not guarded here; it presumably throws on text it
     * cannot parse, and that exception propagates to the caller -- confirm against the geodesy library.
     *
     * @param rawtext the rawtext; only used (with whitespace deleted) when {@code _text} is null
     * @param _text text normalized, optionally
     * @param elements matched groups within regex pattern; the groups read here are
     *                 "MGRSZone", "MGRSQuad" and "Easting_Northing"
     * @return array of possible MGRS interpretations, or null when the match is rejected
     *         or a required group is missing.
     */
    public static MGRS[] parseMGRS(String rawtext, String _text, Map<String, String> elements) {
        // Nothing to work with: no groups, or no text at all.
        if (elements == null || (_text == null && rawtext == null)) {
            return null;
        }

        String text = (_text != null) ? _text : TextUtils.delete_whitespace(rawtext);
        if (text == null) {
            return null;
        }

        // Filter out trivial DD DEG MM pattern.
        // This may not be an issue -- how prevalent is the DD DEG MM DMS pattern?
        // Trivial test: 44 DEG 34 is not an MGRS pattern.
        if (text.length() < 6) {
            // less than 6 chars long this is either a zone with no offset
            //  or some sort of false positive.  Pattern should not match this
            return null;
        }

        if (text.length() < 8 && text.substring(2, 5).equalsIgnoreCase("DEG")) {
            return null;
        }

        // If we matched an obvious and invalid month
        // as an MGRS, then fail early.  Otherwise MGRSFilter
        // will parse out more complex patterns that are date + time
        // NOTE: an MGRS pattern may indeed look like a date+time in some cases but it
        // can actually be a valid MGRS. Take care not to filter out too aggressively.
        if (filterOutMonths(text)) {
            return null;
        }

        // Gridzone required. It must be at least two characters long so that the
        // two-character prefix below can be taken without running off the end.
        String gzd = elements.get("MGRSZone");
        if (gzd == null || gzd.length() < 2) {
            return null;
        }

        // GZD Rule:  00 not allowed in 5-digit GZD
        //             0 not allowed in 4-digit
        // parseInt yields -1 for non-numeric text, so num2 is -1 when the
        // zone has a single digit followed by a letter.
        int num1 = parseInt(gzd.substring(0, 1));
        int num2 = parseInt(gzd.substring(0, 2));

        if (num2 == 0 || (num1 == 0 && gzd.length() == 2)) {
            return null;
        }

        if (num1 < 0) {
            // Pattern should have never matched.
            return null;
        }

        // GZD Rule: numbered zones not greater than 60
        if (num2 > 60) {
            return null;
        }

        //---------------------------------------|
        //
        // MGRS precision is 1m.  Quad is 100,000m sq so resolution is 5 digits + 5 digits with optional whitespace
        // 99999n 99999e  -- in MGRS we never see "m" units or N/E denoted explicitly
        // Occasionally, newlines or whitespace are interspersed in offset
        // minimal:
        // dd
        // ddddd ddddd  with an additional one or two white spaces.   The offsets start and end with numbers. Only whitespace between is optional.
        // ddddd dddddd  additional digit in second half -- trailing 6th digit is a typo; trim off
        // dddddd ddddd  additional digit in first half  -- trailing 6th digit is a typo; trim off
        // ddddddddddd   Typo introduces ambiguity -- only correct thing is to split on halfway point +/- 1 digit and emit two answers
        // dd\nddd ddddd  Newline early in offset
        //---------------------------------------|
        String ne = elements.get("Easting_Northing");
        if (ne == null) {
            return null;
        }

        int digits = TextUtils.count_digits(ne);
        boolean odd_len = ((digits & 0x0001) == 1);

        if (!isValidEastingNorthing(ne, odd_len)) {
            return null;
        }

        if (!odd_len) {
            //----------------------------
            // Completely normal MGRS with even number of digits.
            //
            // By this point you should have passed in normalized coordinate text - no whitespace
            //----------------------------
            return new MGRS[] { new MGRS(text) };
        }

        //----------------------------
        // Slightly obscure case that is possibly a typo or Easting/Northing disturbed.
        //
        // The following logic for parsing is predominantly related to managing typos and rare cases.
        // < 5% of the instances seen fall into this category.
        //----------------------------

        // The repaired coordinate is rebuilt as zone + quad + offset, so the quad is required;
        // a null would otherwise be spliced in as the literal text "null".
        String quad = elements.get("MGRSQuad");
        if (quad == null) {
            return null;
        }

        int space_count = TextUtils.count_ws(ne);
        if (space_count == 0) {
            // ddddddddd   odd number of digits, no spaces: ambiguous, emit both readings.
            return splitAtMidpoint(gzd, quad, ne);
        }

        String nenorm = TextUtils.squeeze_whitespace(ne);
        space_count = TextUtils.count_ws(nenorm);
        int ws_index = nenorm.indexOf(" ");
        int midpoint = (nenorm.length() / 2);

        // Even Split -- meaning easting northing appear to be good. But one needs to be fixed.
        // Given one of
        // dddd ddddd
        // ddddd dddd
        // dd ddddddd
        // where whitespace is ' ' or '\n' or '\r', etc.

        // GIVEN: dddd ddddd
        if (space_count == 1 && (ws_index + 1) == midpoint) {
            StringBuilder padded = new StringBuilder(nenorm);
            // ANSWER: dddd0 ddddd
            padded.insert(ws_index, "0");
            padded.insert(0, quad);
            padded.insert(0, gzd);

            // Just one answer:
            return new MGRS[] { new MGRS(TextUtils.delete_whitespace(padded.toString())) };
        }

        // GIVEN: ddddd dddd
        if (space_count == 1 && (ws_index == midpoint)) {
            StringBuilder padded = new StringBuilder(nenorm);
            // ANSWER: ddddd dddd0
            padded.append("0");
            padded.insert(0, quad);
            padded.insert(0, gzd);

            return new MGRS[] { new MGRS(TextUtils.delete_whitespace(padded.toString())) };
        }

        // Given
        //   ddd dd d
        //   ddddd ddd dd
        //   etc.
        //   You have a bunch of MGRS digits broken up by whitespace.
        //   This is really obscure case where formatting or content conversion
        //      or word processing interfered with the MGRS text.
        //
        //  This is < 0.1% of the cases
        //
        return splitAtMidpoint(gzd, quad, TextUtils.delete_whitespace(ne));
    }

    /**
     * Builds the two candidate readings of an offset that has an odd number of digits
     * and no usable separator.
     *
     * answer 1: a "0" is inserted at the midpoint, i.e. the first half is padded:  dddd|ddddd ==&gt; dddd0 ddddd
     * answer 2: a "0" is appended, i.e. the second half is padded:                 ddddd|dddd ==&gt; ddddd dddd0
     *
     * @param gzd grid zone designator text, placed first
     * @param quad quad text, placed after the zone
     * @param offset offset text without whitespace
     * @return two MGRS objects, answer 1 then answer 2
     */
    private static MGRS[] splitAtMidpoint(String gzd, String quad, String offset) {
        String prefix = gzd + quad;
        int midpoint = (offset.length() / 2);

        String firstHalfPadded = prefix + offset.substring(0, midpoint) + "0" + offset.substring(midpoint);
        String secondHalfPadded = prefix + offset + "0";

        return new MGRS[] { new MGRS(firstHalfPadded), new MGRS(secondHalfPadded) };
    }

    /**
     * A heuristic from looking at real data, real text artifacts - typos, line endings, whitespace wrapping, etc.
     *
     * Acceptable Northing/Eastings:
     * dd dd
     * dddd dddd
     *
     * typos: (odd number of digits;  whitespace or not.)
     * ddd dd
     * ddddd
     *
     * Not valid:
     *
     * dd dd\nd   odd digits and has line endings
     *
     * @param ne NE string, e.g,. 56789 01234; null is treated as not valid
     * @param oddLength if len is odd
     * @return if easting/northing is valid
     */
    protected static boolean isValidEastingNorthing(String ne, boolean oddLength) {
        if (ne == null) {
            return false;
        }

        // PARSE RULE:  ignore abnormal MGRS patterns with line endings in the match
        //
        //  The MGRS easting/northing is messy and contains line endings.
        //  Abort. This is not likely an MGRS worth anything.
        //
        boolean containsEOL = (ne.contains("\n") || ne.contains("\r"));
        boolean containsTAB = ne.contains("\t");
        if (oddLength) {
            // An odd digit count is already suspicious; any line ending or tab disqualifies it.
            return !(containsEOL || containsTAB);
        }

        int wsCount = TextUtils.count_ws(ne);

        // NO:
        // dd dd\ndd
        // YES:  normal text wrap on offset.
        // dd\ndd
        if (wsCount > 1 && containsEOL) {
            return false;
        }
        if (wsCount > 2) {
            return false;
        }

        return true;
    }

    /**
     * Lenient integer parse used for the grid zone prefix.
     *
     * @param x an integer string
     * @return int for the string, or -1 if it is null or not a valid integer
     */
    protected static int parseInt(String x) {
        try {
            return Integer.parseInt(x);
        } catch (NumberFormatException e) {
            // Integer.parseInt reports null and malformed input alike with NumberFormatException.
            return -1;
        }
    }

    /**
     * While date/month patterns match the MGRS format, there are certain months that are just too common
     * to believe they are relevant MGRS patterns.
     * All entries are lower case; text must be lower-cased before lookup.
     */
    private static final Set<String> ignoreMonths = new HashSet<String>();
    static {
        ignoreMonths.add("jan");  // Lat band that is mostly water; Southern Africa
        ignoreMonths.add("feb");  // ditto; almost always water.
        //ignoreMonths.add("mar");  // Valid Congo, Brazil.

        ignoreMonths.add("apr");  // Invalid zone, first letter is C-X; Not likely to ever match
        ignoreMonths.add("aug");  // ditto

        // Other months, however have to be parsed. If they are dates
        // AND runtime flags have MGRS Filters enabled, then dates will be filtered out usually.
        //
    }

    /**
     * Filter out well-known date patterns that are not valid MGRS;
     * MGRS Filter may additionally parse out more patterns. But since we generate an MGRS object here
     * we can filter such things out ahead of time, avoiding the inevitable exception.
     *
     * The three letters following a two-character prefix (e.g. "14JAN...") and following a
     * one-character prefix (e.g. "4JAN...") are each compared, ignoring case, with the ignored months.
     *
     * @param t whitespace-free candidate text
     * @return true if the text carries one of the ignored month abbreviations at either position;
     *         false otherwise, including when the text is null or shorter than 5 characters
     */
    private static boolean filterOutMonths(String t) {
        if (t == null || t.length() < 5) {
            return false;
        }

        // Both lookups must use the lower-cased text, since the set holds lower-case entries only.
        String raw = t.toLowerCase();
        return ignoreMonths.contains(raw.substring(2, 5)) || ignoreMonths.contains(raw.substring(1, 4));
    }

}