package org.opensextant.regex.time;

import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.MatchResult;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.opensextant.regex.Normalizer;
import org.opensextant.regex.RegexAnnotation;
import org.opensextant.regex.RegexRule;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Normalizer for "Date", "Time" and "DayOfTheMonth" annotations.
 *
 * <p>
 * For "Date" annotations the capture groups of the match are mapped to Joda
 * pattern elements (via the rule's element map), reassembled into a reduced
 * text plus an equivalent Joda pattern, parsed, and the resulting
 * {@link Date} and {@link TimeResolution} are stored in the annotation's
 * feature map under the keys "date" and "precision". "Time" and
 * "DayOfTheMonth" annotations only receive rule metadata and a precision of
 * {@link TimeResolution#NONE}.
 */
public class DateTimeNormalizer2 implements Normalizer {

    /**
     * Enum representing the specificity of the date time reference. The
     * declaration order matters: {@link #normalizeDate} uses
     * {@code compareTo} to pick the most precise element, so later constants
     * are treated as more precise.
     */
    public enum TimeResolution {
        NONE, ESTIMATED, ERA, CENTURY, DECADE, YEAR, MONTH, WEEK, DAY, HOUR, MINUTE, SECOND, FRACTIONAL_SECOND
    }

    /**
     * Map of the Joda formatting elements used for normalization to their
     * corresponding TimeResolutions. NOTE: any element names appearing in a
     * regex other than these will be ignored for normalization purposes.
     */
    private static final Map<String, TimeResolution> jodaNames = new HashMap<String, TimeResolution>();

    static {
        jodaNames.put("G", TimeResolution.ERA);
        jodaNames.put("YYYY", TimeResolution.YEAR);
        jodaNames.put("YY", TimeResolution.YEAR);
        jodaNames.put("E", TimeResolution.NONE);
        jodaNames.put("M", TimeResolution.MONTH);
        jodaNames.put("MM", TimeResolution.MONTH);
        jodaNames.put("MMM", TimeResolution.MONTH);
        jodaNames.put("MMMM", TimeResolution.MONTH);
        jodaNames.put("d", TimeResolution.DAY);
        jodaNames.put("dd", TimeResolution.DAY);
        jodaNames.put("a", TimeResolution.NONE);
        jodaNames.put("K", TimeResolution.HOUR);
        jodaNames.put("h", TimeResolution.HOUR);
        jodaNames.put("hh", TimeResolution.HOUR);
        jodaNames.put("H", TimeResolution.HOUR);
        jodaNames.put("HH", TimeResolution.HOUR);
        jodaNames.put("k", TimeResolution.HOUR);
        jodaNames.put("m", TimeResolution.MINUTE);
        jodaNames.put("mm", TimeResolution.MINUTE);
        jodaNames.put("s", TimeResolution.SECOND);
        jodaNames.put("ss", TimeResolution.SECOND);
        jodaNames.put("SS", TimeResolution.FRACTIONAL_SECOND);
        jodaNames.put("z", TimeResolution.NONE);
    }

    /** Log object. */
    private static final Logger LOGGER = LoggerFactory.getLogger(DateTimeNormalizer2.class);

    /**
     * Dispatches to the type-specific normalization based on the annotation
     * type (compared case-insensitively). Annotations of any other type are
     * left untouched.
     *
     * @param anno the annotation whose feature map receives the results
     * @param r the rule that produced the match
     * @param matchResult the regex match the annotation was built from
     */
    @Override
    public void normalize(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
        String type = anno.getType();
        if ("Date".equalsIgnoreCase(type)) {
            normalizeDate(anno, r, matchResult);
        }
        if ("Time".equalsIgnoreCase(type)) {
            normalizeTime(anno, r, matchResult);
        }
        if ("DayOfTheMonth".equalsIgnoreCase(type)) {
            normalizeDay(anno, r, matchResult);
        }
    }

    /**
     * Normalizes a date match: derives a Joda pattern from the named
     * elements of the match, parses the matched values with it and stores
     * the resulting {@link Date} ("date") and {@link TimeResolution}
     * ("precision") in the annotation's features, together with "ruleFamily"
     * and "ruleName".
     *
     * <p>
     * If the pattern cannot be built or the values cannot be parsed, a
     * warning is logged, the annotation is marked invalid and no "date" or
     * "precision" feature is written.
     *
     * @param anno the annotation to normalize
     * @param r the rule that produced the match
     * @param matchResult the regex match the annotation was built from
     */
    public void normalizeDate(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
        Map<String, Object> normalizedResults = anno.getFeatures();
        Map<String, String> elementsFound = collectElements(r, matchResult, normalizedResults);

        normalizedResults.put("ruleFamily", r.getRuleFamily());
        normalizedResults.put("ruleName", r.getRuleName());

        // Build a reduced match and the equivalent format string from each
        // Joda element found; the reduced match contains only the values
        // relevant to normalization. Both are appended in the same
        // iteration, so value and pattern positions always line up.
        StringBuilder reducedMatch = new StringBuilder();
        StringBuilder jodaPattern = new StringBuilder();

        // most precise element seen in the pattern so far
        TimeResolution mostPrec = TimeResolution.NONE;
        boolean hasYear = false;

        for (Map.Entry<String, String> entry : elementsFound.entrySet()) {
            String elem = entry.getKey();
            TimeResolution tr = jodaNames.get(elem);
            if (tr == null) {
                // not a Joda element name: irrelevant for normalization
                continue;
            }

            // clean up some specific conditions Joda would not accept
            String elemValue = cleanValues(elem, entry.getValue());

            reducedMatch.append(elemValue).append(" ");
            jodaPattern.append(elem).append(" ");

            if (TimeResolution.YEAR.equals(tr)) {
                hasYear = true;
            }
            if (tr.compareTo(mostPrec) > 0) {
                mostPrec = tr;
            }
        }

        // create a Joda formatter from the format string created above
        DateTimeFormatter fmt = null;
        try {
            fmt = DateTimeFormat.forPattern(jodaPattern.toString());
        } catch (Exception e) {
            LOGGER.warn("Could not use format " + jodaPattern + " derived from " + anno.getMatchText()
                    + " setting annotation as invalid", e);
            anno.setValid(false);
            return;
        }

        // no year in the match: fall back to an estimated year
        if (!hasYear) {
            int estYear = getEstimatedYear();
            fmt = fmt.withDefaultYear(estYear);
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("No year in pattern " + jodaPattern + " using:" + estYear);
            }
            // set time resolution to ESTIMATED to indicate assumption made
            mostPrec = TimeResolution.ESTIMATED;
        }

        // parse the reduced match using the derived formatter
        DateTime dt = null;
        try {
            dt = fmt.parseDateTime(reducedMatch.toString());
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Parsing ->" + anno.getMatchText() + "<- reduced to ->" + reducedMatch
                        + "<- as format ->" + jodaPattern + "<- got " + dt);
            }
        } catch (Exception e) {
            LOGGER.warn("Cannot normalize " + anno.getMatchText() + " using " + r, e);
            anno.setValid(false);
            return;
        }

        // adjust precision for some specific cases:
        // look for phrases like "the 1990's" or "the 1800s"
        if ("YEAR".equalsIgnoreCase(r.getRuleFamily())) {
            // get phrase, remove apostrophes and tics
            String phrase = anno.getMatchText().trim().replaceAll("['`]", "");
            if (phrase.endsWith("0s")) {
                mostPrec = TimeResolution.DECADE;
            }
            // checked second on purpose: "00s" also ends with "0s" and must win
            if (phrase.endsWith("00s")) {
                mostPrec = TimeResolution.CENTURY;
            }
        }

        // create a JDK Date to return
        Date jdkDate = dt.toDate();
        normalizedResults.put("date", jdkDate);
        normalizedResults.put("precision", mostPrec);
    }

    /**
     * Records rule metadata for a time match. No date value is computed;
     * the precision is always {@link TimeResolution#NONE}.
     */
    private void normalizeTime(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
        annotateWithoutDate(anno, r, matchResult);
    }

    /**
     * Records rule metadata for a day-of-the-month match. No date value is
     * computed; the precision is always {@link TimeResolution#NONE}.
     */
    private void normalizeDay(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
        annotateWithoutDate(anno, r, matchResult);
    }

    /**
     * Shared implementation for annotation types that are not resolved to a
     * date: writes "ruleFamily", "ruleName" and a "precision" of
     * {@link TimeResolution#NONE} to the annotation's features.
     */
    private void annotateWithoutDate(RegexAnnotation anno, RegexRule r, MatchResult matchResult) {
        Map<String, Object> normalizedResults = anno.getFeatures();
        // return value intentionally unused: only the debug-mode copy of the
        // raw groups into the features is needed here
        collectElements(r, matchResult, normalizedResults);
        normalizedResults.put("ruleFamily", r.getRuleFamily());
        normalizedResults.put("ruleName", r.getRuleName());
        normalizedResults.put("precision", TimeResolution.NONE);
    }

    /**
     * Collects the value of every capture group (including group 0) keyed by
     * the element name the rule assigns to that group index.
     *
     * <p>
     * Groups without an element name, and groups that did not participate in
     * the match (whose value is {@code null}), are left out of the returned
     * map. This keeps a non-participating group from overwriting the value
     * of an earlier group with the same element name and keeps {@code null}
     * out of the later clean-up and parsing steps.
     *
     * <p>
     * When debug logging is enabled, every raw name/value pair is also
     * copied into {@code features}.
     *
     * @param r the rule providing the group-index to element-name mapping
     * @param matchResult the match to read the groups from
     * @param features feature map that receives the raw groups in debug mode
     * @return element name to matched text; never {@code null}
     */
    private Map<String, String> collectElements(RegexRule r, MatchResult matchResult,
            Map<String, Object> features) {
        Map<String, String> found = new HashMap<String, String>();
        boolean debug = LOGGER.isDebugEnabled();
        int numGroups = matchResult.groupCount();
        // groupCount() excludes group 0, hence the inclusive upper bound
        for (int i = 0; i <= numGroups; i++) {
            String value = matchResult.group(i);
            String name = r.getElementMap().get(i);
            if (debug) {
                features.put(name, value);
            }
            if (name != null && value != null) {
                found.put(name, value);
            }
        }
        return found;
    }

    /**
     * TODO decide on value for missing year if the pattern does not include
     * some form of "YEAR": use current year? Joda default year (2000)? year
     * 0000?
     *
     * @return the year to assume when none has been found; currently the
     *         current year
     */
    private int getEstimatedYear() {
        DateTime now = new DateTime();
        return now.getYear();
    }

    /**
     * Some hackery to convert some values to those that Joda recognizes.
     *
     * @param elemName the Joda element name the value belongs to
     * @param elemValue the raw matched text; {@code null} is returned as is
     * @return the cleaned value
     */
    private String cleanValues(String elemName, String elemValue) {
        if (elemValue == null) {
            return null;
        }
        String cleanValue = elemValue;

        // strip trailing periods on abbreviated months and add "SEPT" as
        // valid abbrev
        if ("MMM".equals(elemName)) {
            cleanValue = cleanValue.replaceFirst("\\.$", "").replaceFirst("(?i:sept)", "Sep");
        }

        // strip leading apostrophe/tic from abbreviated year
        if ("YY".equals(elemName)) {
            cleanValue = cleanValue.replaceFirst("^['`]", "");
        }

        // strip trailing ordinals from days
        if ("dd".equals(elemName)) {
            cleanValue = cleanValue.replaceFirst("(st|nd|rd|th|ST|ND|RD|TH)$", "");
        }

        // strip periods from abbreviated eras
        if ("G".equals(elemName)) {
            cleanValue = cleanValue.replaceAll("\\.", "").toUpperCase();
        }

        // convert Z, ZULU and UTC timezones to GMT
        if ("z".equals(elemName) && ("Z".equalsIgnoreCase(cleanValue) || "ZULU".equalsIgnoreCase(cleanValue)
                || "UTC".equalsIgnoreCase(cleanValue))) {
            cleanValue = "GMT";
        }

        // strip periods from am/pm markers
        if ("a".equals(elemName)) {
            cleanValue = cleanValue.replaceAll("\\.", "").toUpperCase();
        }

        return cleanValue;
    }
}