DateTimeNormalizer2.java example

Explorer
OpenSextantToolbox-master
- src
  - org
    - opensextant
package org.opensextant.regex.time;

import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.MatchResult;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.opensextant.regex.Normalizer;
import org.opensextant.regex.RegexAnnotation;
import org.opensextant.regex.RegexRule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class DateTimeNormalizer2 implements Normalizer {

	/** Enum representing the specificity of the date time reference. */
	public enum TimeResolution {
		NONE, ESTIMATED, ERA, CENTURY, DECADE, YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, FRACTIONAL_SECOND
	}

	/**
	 * Map containing all of the Joda defined formatting elements and their
	 * corresponding TimeResolutions NOTE: any element names appearing in a
	 * regex other than these will be ignored for normalization purposes
	 */
	private static Map<String, TimeResolution> jodaNames = new HashMap<String, TimeResolution>();
	static {
		jodaNames.put("G", TimeResolution.ERA);
		jodaNames.put("YYYY", TimeResolution.YEAR);
		jodaNames.put("YY", TimeResolution.YEAR);
		jodaNames.put("E", TimeResolution.NONE);
		jodaNames.put("M", TimeResolution.MONTH);
		jodaNames.put("MM", TimeResolution.MONTH);
		jodaNames.put("MMM", TimeResolution.MONTH);
		jodaNames.put("MMMM", TimeResolution.MONTH);
		jodaNames.put("d", TimeResolution.DAY);
		jodaNames.put("dd", TimeResolution.DAY);
		jodaNames.put("a", TimeResolution.NONE);
		jodaNames.put("K", TimeResolution.HOUR);
		jodaNames.put("h", TimeResolution.HOUR);
		jodaNames.put("hh", TimeResolution.HOUR);
		jodaNames.put("H", TimeResolution.HOUR);
		jodaNames.put("HH", TimeResolution.HOUR);
		jodaNames.put("k", TimeResolution.HOUR);
		jodaNames.put("m", TimeResolution.MINUTE);
		jodaNames.put("mm", TimeResolution.MINUTE);
		jodaNames.put("s", TimeResolution.SECOND);
		jodaNames.put("ss", TimeResolution.SECOND);
		jodaNames.put("SS", TimeResolution.FRACTIONAL_SECOND);
		jodaNames.put("z", TimeResolution.NONE);
	}

	/** Log object. */
	private static final Logger LOGGER = LoggerFactory.getLogger(DateTimeNormalizer2.class);

	@Override
	public void normalize(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {

		if ("Date".equalsIgnoreCase(anno.getType())) {
			normalizeDate(anno, r, matchResult);
		}

		if ("Time".equalsIgnoreCase(anno.getType())) {
			normalizeTime(anno, r, matchResult);
		}

		if ("DayOfTheMonth".equalsIgnoreCase(anno.getType())) {
			normalizeDay(anno, r, matchResult);
		}

	}

	public void normalizeDate(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {

		Map<String, Object> normalizedResults = anno.getFeatures();
		Map<String, String> elementsFound = new HashMap<String, String>();
		int numGroups = matchResult.groupCount();
		for (int i = 0; i < numGroups + 1; i++) {
			String elemenValue = matchResult.group(i);
			String elemName = r.getElementMap().get(i);
			elementsFound.put(elemName, elemenValue);
			if (LOGGER.isDebugEnabled()) {
				normalizedResults.put(elemName, elemenValue);
			}
		}

		normalizedResults.put("ruleFamily", r.getRuleFamily());
		normalizedResults.put("ruleName", r.getRuleName());

		// create a reduced match and equivalent format string from each Joda
		// element found
		// the reduced match will contain only values relevant to normalization
		StringBuilder reducedMatch = new StringBuilder();
		StringBuilder jodaPattern = new StringBuilder();
		// look for the most precise element in pattern
		TimeResolution mostPrec = TimeResolution.NONE;
		boolean hasYear = false;
		for (String elem : elementsFound.keySet()) {
			String elemValue = elementsFound.get(elem);

			// clean up some specific conditions
			elemValue = cleanValues(elem, elemValue);

			// see if elem is a joda named elem
			if (jodaNames.keySet().contains(elem)) {
				// add elem value to reduced match
				reducedMatch.append(elemValue).append(" ");
				// add elem to pattern string
				jodaPattern.append(elem).append(" ");
				// get the resolution for this element
				TimeResolution tr = jodaNames.get(elem);

				// see if we have a year resolution
				if (TimeResolution.YEAR.equals(tr)) {
					hasYear = true;
				}

				// is this the most precise resolution seen?
				if (tr.compareTo(mostPrec) > 0) {
					mostPrec = jodaNames.get(elem);
				}
			}
		}

		// create a Joda formatter from the format string created above
		DateTimeFormatter fmt = null;

		try {
			fmt = DateTimeFormat.forPattern(jodaPattern.toString());
		} catch (Exception e) {
			LOGGER.warn("Could not use format " + jodaPattern + " derived from " + anno.getMatchText()
					+ " setting annotation as invalid", e);
			anno.setValid(false);
			return;
		}

		// check if year value missing
		if (!hasYear) {
			int estYear = getEstimatedYear();
			fmt = fmt.withDefaultYear(estYear);
			LOGGER.debug("No year in pattern " + jodaPattern + " using:" + estYear);
			// set time resolution to ESTIMATED to indicate assumption made
			mostPrec = TimeResolution.ESTIMATED;
		}

		// parse the reduced match using the derived formatter
		DateTime dt = null;
		try {
			dt = fmt.parseDateTime(reducedMatch.toString());
			LOGGER.debug("Parsing ->" + anno.getMatchText() + "<- reduced to ->" + reducedMatch + "<- as format ->"
					+ jodaPattern + "<- got " + dt);
		} catch (Exception e) {
			LOGGER.warn("Cannot normalize " + anno.getMatchText() + " using " + r, e);
			anno.setValid(false);
			return;
		}

		// adjust precision for some specific cases

		// look for phrases like "the 1990's" or "the 1800s"
		if ("YEAR".equalsIgnoreCase(r.getRuleFamily())) {
			// get phrase, remove apostrophes and tics
			String phrase = anno.getMatchText().trim().replaceAll("['`]", "");

			if (phrase.endsWith("0s")) {
				mostPrec = TimeResolution.DECADE;
			}
			if (phrase.endsWith("00s")) {
				mostPrec = TimeResolution.CENTURY;
			}

		}

		// create a JDK Date to return
		Date jdkDate = dt.toDate();

		normalizedResults.put("date", jdkDate);
		normalizedResults.put("precision", mostPrec);

		return;
	}

	private void normalizeTime(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
		Map<String, Object> normalizedResults = anno.getFeatures();
		Map<String, String> elementsFound = new HashMap<String, String>();
		int numGroups = matchResult.groupCount();
		for (int i = 0; i < numGroups + 1; i++) {
			String elemenValue = matchResult.group(i);
			String elemName = r.getElementMap().get(i);
			elementsFound.put(elemName, elemenValue);
			if (LOGGER.isDebugEnabled()) {
				normalizedResults.put(elemName, elemenValue);
			}
		}

		normalizedResults.put("ruleFamily", r.getRuleFamily());
		normalizedResults.put("ruleName", r.getRuleName());
		normalizedResults.put("precision", TimeResolution.NONE);
		return;
	}

	private void normalizeDay(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
		Map<String, Object> normalizedResults = anno.getFeatures();
		Map<String, String> elementsFound = new HashMap<String, String>();
		int numGroups = matchResult.groupCount();
		for (int i = 0; i < numGroups + 1; i++) {
			String elemenValue = matchResult.group(i);
			String elemName = r.getElementMap().get(i);
			elementsFound.put(elemName, elemenValue);
			if (LOGGER.isDebugEnabled()) {
				normalizedResults.put(elemName, elemenValue);
			}
		}

		normalizedResults.put("ruleFamily", r.getRuleFamily());
		normalizedResults.put("ruleName", r.getRuleName());
		normalizedResults.put("precision", TimeResolution.NONE);
		return;
	}

	/**
	 * TODO decide on value for missing year if the pattern does not include
	 * some form of "YEAR", use current year?, Joda default year (2000),? year
	 * 0000? get a year value when none has been found
	 */
	private int getEstimatedYear() {
		DateTime now = new DateTime();
		return now.getYear();
	}

	/** Some hackery to convert some values to those that Joda recognizes. */
	private String cleanValues(String elemName, String elemValue) {

		String cleanValue = elemValue;
		// strip trailing periods on abbreviated months and add "SEPT" as valid
		// abbrev
		if ("MMM".equals(elemName)) {
			cleanValue = cleanValue.replaceFirst("\\.$", "").replaceFirst("(?i:sept)", "Sep");
		}

		// strip leading apostrophe/tic from abbreviated year
		if ("YY".equals(elemName)) {
			cleanValue = cleanValue.replaceFirst("^['`]", "");
		}

		// strip trailing ordinals from days
		if ("dd".equals(elemName)) {
			cleanValue = cleanValue.replaceFirst("(st|nd|rd|th|ST|ND|RD|TH)$", "");
		}

		// strip periods from abbreviated eras
		if ("G".equals(elemName)) {
			cleanValue = cleanValue.replaceAll("\\.", "").toUpperCase();
		}

		// convert Z,ZULU and UTC timezones to GMT
		if ("z".equals(elemName) && ("Z".equalsIgnoreCase(cleanValue) || "ZULU".equalsIgnoreCase(cleanValue)
				|| "UTC".equalsIgnoreCase(cleanValue))) {
			cleanValue = "GMT";
		}

		// strip periods from am/pm eras
		if ("a".equals(elemName)) {
			cleanValue = cleanValue.replaceAll("\\.", "").toUpperCase();
		}

		return cleanValue;
	}

}