/**
*
* Copyright 2012-2013 The MITRE Corporation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* **************************************************************************
* NOTICE
* This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
*/
package org.opensextant.extractors.xcoord;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.opensextant.geodesy.MGRS;
import org.opensextant.util.TextUtils;
/**
*
* @author ubaldino
*/
public class MGRSParser {

    /**
     * Interprets a regex match as one or more MGRS coordinates, as best as can be done.
     *
     * <p>The match is rejected (the method returns {@code null}) when it is too short, looks like a
     * "DD DEG MM" fragment, looks like a common date (see {@link #filterOutMonths(String)}), has a
     * missing or invalid grid zone designator, or has an easting/northing offset that fails
     * {@link #isValidEastingNorthing(String, boolean)}.
     *
     * <p>An offset with an even number of digits yields exactly one MGRS built from the normalized
     * text. An offset with an odd number of digits is treated as a typo: a single "0" is added to
     * whichever half is short. When the whitespace does not reveal which half is short, both
     * possibilities are returned.
     *
     * <p>TODO: provide level of confidence. Items that match the MGRS scheme perfectly are more
     * likely to be MGRS than those that are not perfect matches, e.g. typos, inadvertent text
     * wrapping, whitespace etc.
     *
     * <p>NOTE(review): the {@code MGRS} constructor is assumed to throw on text it cannot parse;
     * such exceptions are not caught here and propagate to the caller.
     *
     * @param rawtext  the raw matched text; only used when {@code _text} is null
     * @param _text    the matched text already normalized (no whitespace), or null to have this
     *                 method strip whitespace from {@code rawtext}
     * @param elements matched groups within the regex pattern; the groups read here are
     *                 "MGRSZone", "MGRSQuad" and "Easting_Northing"
     * @return array of possible MGRS interpretations, or null if the match is not usable as MGRS
     */
    public static MGRS[] parseMGRS(String rawtext, String _text, Map<String, String> elements) {
        // Prefer caller-normalized text; otherwise remove whitespace from the raw match.
        String text = (_text != null) ? _text : TextUtils.delete_whitespace(rawtext);

        // Less than 6 chars long this is either a zone with no offset
        // or some sort of false positive. Pattern should not match this.
        if (text.length() < 6) {
            return null;
        }

        // Filter out trivial DD DEG MM pattern, e.g. "44 DEG 34" is not an MGRS pattern.
        // The substring is safe: length is at least 6 here.
        if (text.length() < 8 && text.substring(2, 5).equalsIgnoreCase("DEG")) {
            return null;
        }

        // If we matched an obvious and invalid month as an MGRS, then fail early.
        // NOTE: an MGRS pattern may look like a date+time in some cases but actually be a
        // valid MGRS, so only a short list of months is filtered here.
        if (filterOutMonths(text)) {
            return null;
        }

        // Grid zone designator is required. It must hold at least two characters so that the
        // one- and two-character numeric prefixes below can be taken without an index error.
        String gzd = elements.get("MGRSZone");
        if (gzd == null || gzd.length() < 2) {
            return null;
        }

        // parseInt yields -1 for a non-numeric prefix, e.g. "4Q" as a two-character prefix.
        int firstDigit = parseInt(gzd.substring(0, 1));
        int zoneNumber = parseInt(gzd.substring(0, 2));

        // GZD rule: a two-digit zone of 00 is not allowed, nor is a single-digit zone of 0.
        if (zoneNumber == 0 || (firstDigit == 0 && gzd.length() == 2)) {
            return null;
        }
        // GZD must start with a digit. Pattern should have never matched otherwise.
        if (firstDigit < 0) {
            return null;
        }
        // GZD rule: numbered zones are not greater than 60.
        if (zoneNumber > 60) {
            return null;
        }

        //---------------------------------------|
        // MGRS precision is 1m. Quad is 100,000m sq, so resolution is 5 digits + 5 digits with
        // optional whitespace. In MGRS we never see "m" units or N/E denoted explicitly.
        // Occasionally, newlines or whitespace are interspersed in the offset:
        //
        //   dd              minimal
        //   ddddd ddddd     one or two white spaces between the halves
        //   ddddd dddddd    extra digit in one half -- typo
        //   ddddddddddd     typo introduces ambiguity -- emit two answers
        //   dd\nddd ddddd   newline early in offset
        //---------------------------------------|
        String ne = elements.get("Easting_Northing");
        if (ne == null) {
            // No offset group captured; nothing to interpret.
            return null;
        }

        int digits = TextUtils.count_digits(ne);
        boolean oddDigits = (digits & 1) == 1;

        if (!isValidEastingNorthing(ne, oddDigits)) {
            return null;
        }

        if (!oddDigits) {
            // Completely normal MGRS with an even number of digits.
            // By this point the coordinate text is expected to be normalized -- no whitespace.
            return new MGRS[] { new MGRS(text) };
        }

        //----------------------------
        // Slightly obscure case that is possibly a typo or a disturbed easting/northing.
        // The remaining logic is predominantly about managing typos and rare cases.
        //----------------------------
        String quad = elements.get("MGRSQuad");
        if (quad == null) {
            // Rebuilding the coordinate requires the 100km square letters. Without this guard a
            // missing group would be concatenated as the literal text "null".
            return null;
        }

        if (TextUtils.count_ws(ne) == 0) {
            // ddddddddd: odd number of digits, no spaces. Ambiguous, so give both answers.
            return padBothWays(gzd, quad, ne);
        }

        // NOTE(review): squeeze_whitespace presumably collapses each whitespace run to a single
        // space; the indexOf(" ") below relies on that -- confirm against TextUtils.
        String squeezed = TextUtils.squeeze_whitespace(ne);
        int spaceCount = TextUtils.count_ws(squeezed);
        int wsIndex = squeezed.indexOf(" ");
        int midpoint = squeezed.length() / 2;
        String prefix = gzd + quad;

        if (spaceCount == 1 && (wsIndex + 1) == midpoint) {
            // GIVEN:  dddd ddddd  -- first half is one digit short.
            // ANSWER: dddd0 ddddd
            String padded = squeezed.substring(0, wsIndex) + "0" + squeezed.substring(wsIndex);
            return new MGRS[] { new MGRS(TextUtils.delete_whitespace(prefix + padded)) };
        }

        if (spaceCount == 1 && wsIndex == midpoint) {
            // GIVEN:  ddddd dddd  -- second half is one digit short.
            // ANSWER: ddddd dddd0
            return new MGRS[] { new MGRS(TextUtils.delete_whitespace(prefix + squeezed + "0")) };
        }

        // Given e.g.
        //   ddd dd d
        //   ddddd ddd dd
        // You have a bunch of MGRS digits broken up by whitespace. This is a really obscure case
        // where formatting, content conversion or word processing interfered with the MGRS text.
        // Drop all whitespace and treat it like the unbroken odd-length case.
        return padBothWays(gzd, quad, TextUtils.delete_whitespace(ne));
    }

    /**
     * Builds the two candidate coordinates for an odd-length offset that gives no hint as to
     * which half lost a digit.
     *
     * <pre>
     *   ddddddddd
     *   answer 1: dddd0 ddddd   (first half padded with a trailing 0)
     *   answer 2: ddddd dddd0   (second half padded with a trailing 0)
     * </pre>
     *
     * @param gzd    grid zone designator text
     * @param quad   100km square text
     * @param offset easting/northing digits without whitespace
     * @return the two interpretations, first-half-padded candidate first
     */
    private static MGRS[] padBothWays(String gzd, String quad, String offset) {
        String prefix = gzd + quad;
        int midpoint = offset.length() / 2;

        String firstHalfPadded = prefix + offset.substring(0, midpoint) + "0" + offset.substring(midpoint);
        String secondHalfPadded = prefix + offset + "0";

        return new MGRS[] { new MGRS(firstHalfPadded), new MGRS(secondHalfPadded) };
    }

    /**
     * A heuristic from looking at real data and real text artifacts - typos, line endings,
     * whitespace wrapping, etc.
     *
     * <pre>
     * Acceptable northing/eastings:
     *   dd dd
     *   dddd dddd
     *   dd\ndd        normal text wrap on offset
     *
     * Typos (odd number of digits; whitespace or not) are accepted:
     *   ddd dd
     *   ddddd
     *
     * Not valid:
     *   dd dd\nd      odd digits and has line endings (or tabs)
     *   dd dd\ndd     even digits, more than one whitespace character and a line ending
     *   d d d d       even digits, more than two whitespace characters
     * </pre>
     *
     * @param ne        NE string, e.g. 56789 01234
     * @param oddLength true if the number of digits in {@code ne} is odd
     * @return true if the easting/northing is acceptable; false otherwise, including for null
     */
    protected static boolean isValidEastingNorthing(String ne, boolean oddLength) {
        if (ne == null) {
            return false;
        }

        // PARSE RULE: ignore abnormal MGRS patterns with line endings in the match.
        boolean hasEol = ne.contains("\n") || ne.contains("\r");

        if (oddLength) {
            // An odd digit count is already suspect; combined with a line ending or tab the
            // match is not likely an MGRS worth anything.
            return !hasEol && !ne.contains("\t");
        }

        int wsCount = TextUtils.count_ws(ne);

        // A single line wrap inside the offset is normal; a wrap plus other whitespace is not.
        if (wsCount > 1 && hasEol) {
            return false;
        }
        // At most two whitespace characters are tolerated between the halves.
        return wsCount <= 2;
    }

    /**
     * Parses an integer, reporting failure as -1 instead of throwing.
     *
     * @param x an integer string
     * @return int for the string, or -1 if {@code x} is null or not a valid integer
     */
    protected static int parseInt(String x) {
        try {
            return Integer.parseInt(x);
        } catch (NumberFormatException notANumber) {
            // Callers treat a negative value as "not numeric".
            return -1;
        }
    }

    /**
     * While date/month patterns match the MGRS format, there are certain months that are just too
     * common to believe they are relevant MGRS patterns. Entries are lower case.
     */
    private static final Set<String> ignoreMonths = new HashSet<String>();

    static {
        ignoreMonths.add("jan"); // Lat band that is mostly water; Southern Africa
        ignoreMonths.add("feb"); // ditto; almost always water.
        // "mar" is deliberately not ignored: valid in Congo, Brazil.
        ignoreMonths.add("apr"); // Invalid zone, first letter is C-X; not likely to ever match
        ignoreMonths.add("aug"); // ditto
        // Other months, however, have to be parsed. If they are dates AND runtime flags have
        // MGRS filters enabled, then dates will usually be filtered out.
    }

    /**
     * Filters out well-known date patterns that are not valid MGRS. MGRS filters may additionally
     * parse out more patterns, but since an MGRS object is generated here, such things can be
     * filtered out ahead of time, avoiding the inevitable exception.
     *
     * <p>Two three-character windows are compared, case-insensitively, against the ignored month
     * abbreviations: the one following a single leading character (e.g. "4JAN...") and the one
     * following two leading characters (e.g. "04JAN...").
     *
     * @param t normalized coordinate text
     * @return true if the text carries an ignored month abbreviation where the zone letter and
     *         square letters would be; false otherwise, including for null or too-short text
     */
    private static boolean filterOutMonths(String t) {
        if (t == null || t.length() < 4) {
            return false;
        }

        // Compare in lower case for BOTH windows; ignoreMonths holds lower-case entries only.
        String lower = t.toLowerCase();

        if (lower.length() >= 5 && ignoreMonths.contains(lower.substring(2, 5))) {
            return true;
        }
        return ignoreMonths.contains(lower.substring(1, 4));
    }
}