/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.uima.tutorial.ex3;

import java.text.BreakIterator;
import java.text.DateFormat;
import java.text.NumberFormat;
import java.text.ParsePosition;
import java.util.Date;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.ResultSpecification;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.tutorial.DateAnnot;
import org.apache.uima.tutorial.DateTimeAnnot;
import org.apache.uima.tutorial.TimeAnnot;
import org.apache.uima.util.Level;

/**
 * Simple Date/Time annotator.
 *
 * <p>
 * Scans the document text with a handful of regular expressions and creates a {@link TimeAnnot}
 * or {@link DateAnnot} for every match, depending on which of the two types the Result
 * Specification asks for.
 */
public class TutorialDateTime extends JCasAnnotator_ImplBase {

  /** Factory ("function pointer") that creates one annotation of a particular type. */
  static abstract class Maker {
    abstract Annotation newAnnotation(JCas jcas, int start, int end);
  }

  /** The CAS view passed to the most recent {@link #process(JCas)} call. */
  JCas jcas;

  /** Document text of {@link #jcas}; may be null when the CAS carries no text. */
  String input;

  /** Scratch parse position, reset to 0 before every parse. */
  ParsePosition pp = new ParsePosition(0);

  // Static vars holding patterns, and function pointers

  // Time: n:nn or nn:nn, optionally followed by AM or PM
  //   \b                        a word boundary
  //   [0-2]?                    optional first hour digit: 0, 1 or 2
  //   \d:                       second hour digit and the colon
  //   [0-5]\d                   minutes
  //   \s*                       optional white space
  //   (AM\W|PM\W|am\W|pm\W)?    optional AM/PM marker followed by one non-word character
  // Note: everything above is inside capture group 1, so the annotated span includes any
  // trailing white space and the non-word character that follows an AM/PM marker.
  static final Pattern hoursMinutesPattern = Pattern
          .compile("(?s)\\b([0-2]?\\d:[0-5]\\d\\s*(AM\\W|PM\\W|am\\W|pm\\W)?)");

  static final DateFormat dfTimeShort = DateFormat.getTimeInstance(DateFormat.SHORT, Locale.US);

  // Numeric date: m/d, m/d/yy or m/d/yyyy
  //   \b                        a word boundary
  //   [0-1]?                    optional first digit of the month
  //   \d                        second digit of the month
  //   /
  //   [0-3]?                    optional first digit of the day of month
  //   \d                        second digit of the day of month
  //   ((/[1-2]\d\d\d)|(/\d\d))? year is /nnnn or /nn or missing
  //   \W                        one non-word character must follow (not part of group 1)
  static final Pattern numericDatePattern = Pattern
          .compile("(?s)\\b([0-1]?\\d/[0-3]?\\d((/[1-2]\\d\\d\\d)|(/\\d\\d))?)\\W");

  // not static, because DateFormat is not threadsafe
  final DateFormat dfDateShort = DateFormat.getDateInstance(DateFormat.SHORT, Locale.US);

  // Medium date: abbreviated month name, day of month, optional year
  //   \b                        a word boundary
  //   (Jan|Feb|...|Dec)         abbreviated month name
  //   \.?                       optional period
  //   \s                        one white space character
  //   [0-3]?                    optional first digit of the day of month
  //   \d                        second digit of the day of month
  //   (((,\s+)?[1-2]\d\d\d)|((,\s+)?\d\d))?
  //                             year is nnnn or nn, optionally preceded by a comma and white
  //                             space, or missing
  //   \W                        one non-word character must follow (not part of group 1)
  static final String shortMonthNames = "(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec)";

  static final Pattern mediumDatePattern = Pattern.compile("(?s)\\b(" + shortMonthNames
          + "\\.?\\s[0-3]?\\d(((,\\s+)?[1-2]\\d\\d\\d)|((,\\s+)?\\d\\d))?)\\W");

  static final DateFormat dfDateMedium = DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.US);

  // for long month names, exclude May since it is covered by short month names
  static final String longMonthNames = "(January|February|March|April|June|July|August|September|October|November|December)";

  static final Pattern longDatePattern = Pattern.compile("(?s)\\b(" + longMonthNames
          + "\\s[0-3]?\\d(((,\\s+)?[1-2]\\d\\d\\d)|((,\\s+)?\\d\\d))?)\\W");

  static final DateFormat dfDateLong = DateFormat.getDateInstance(DateFormat.LONG, Locale.US);

  static final NumberFormat numberFormat = NumberFormat.getInstance(Locale.US);

  // Per-instance copy of numberFormat: NumberFormat is not synchronized, and the static
  // instance is visible to every annotator instance.
  private final NumberFormat hourParser = (NumberFormat) numberFormat.clone();

  // function pointers for new instances
  static final Maker dateAnnotationMaker = new Maker() {
    @Override
    Annotation newAnnotation(JCas jcas, int start, int end) {
      return new DateAnnot(jcas, start, end);
    }
  };

  static final Maker timeAnnotationMaker = new Maker() {
    @Override
    Annotation newAnnotation(JCas jcas, int start, int end) {
      return new TimeAnnot(jcas, start, end);
    }
  };

  /** Year appended to every date string before parsing, in case the text had no year. */
  static final String defaultYear = "2003";

  private static boolean warningMessageShown = false;

  // PROCESS

  /**
   * The ResultSpecification controls what gets produced. For example, to only produce
   * DateAnnotations, change the descriptor for this component to specify it outputs only that type.
   *
   * <p>
   * When neither TimeAnnot nor DateAnnot is requested for the document language, a warning is
   * written once to System.err and to the UIMA logger.
   *
   * @param aJCas
   *          the CAS view to annotate
   */
  @Override
  public void process(JCas aJCas) {
    jcas = aJCas;
    input = jcas.getDocumentText();

    // Create Annotations
    ResultSpecification resultSpec = getResultSpecification();
    String language = aJCas.getDocumentLanguage();
    boolean timeWanted = resultSpec.containsType("org.apache.uima.tutorial.TimeAnnot", language);
    boolean dateWanted = resultSpec.containsType("org.apache.uima.tutorial.DateAnnot", language);

    if (timeWanted) {
      makeAnnotations(timeAnnotationMaker, hoursMinutesPattern, dfTimeShort);
    }
    if (dateWanted) {
      makeAnnotations(dateAnnotationMaker, numericDatePattern, dfDateShort);
      makeAnnotations(dateAnnotationMaker, mediumDatePattern, dfDateMedium);
      makeAnnotations(dateAnnotationMaker, longDatePattern, dfDateLong);
    }

    if (!timeWanted && !dateWanted && !warningMessageShown) {
      String m = String.format(
              "No output is being produced by the TutorialDateTime annotator because the Result Specification did not contain"
                      + " a request for the type%n" + " org.apache.uima.tutorial.TimeAnnot nor%n"
                      + " org.apache.uima.tutorial.DateAnnot" + " with the language '%s'%n"
                      + " (Note: this message will only be shown once.)%n",
              language);
      System.err.println(m);
      getContext().getLogger().log(Level.WARNING, m);
      warningMessageShown = true;
    }
  }

  // HELPER METHODS

  /**
   * Creates one annotation for every span between two consecutive boundaries reported by the
   * break iterator, skipping spans that consist only of white space.
   *
   * @param m
   *          factory for the annotation type to create
   * @param b
   *          break iterator used to segment the document text
   */
  void makeAnnotations(Maker m, BreakIterator b) {
    if (input == null) {
      return; // no document text, nothing to segment
    }
    b.setText(input);
    // first() must be called before next(): first() rewinds the iterator, so calling it
    // second would make the loop visit one extra, empty span.
    int start = b.first();
    for (int end = b.next(); end != BreakIterator.DONE; start = end, end = b.next()) {
      // eliminate all-whitespace tokens
      if (!isAllWhitespace(start, end)) {
        m.newAnnotation(jcas, start, end).addToIndexes();
      }
    }
  }

  /** Returns true when every character of input in [start, end) is white space. */
  private boolean isAllWhitespace(int start, int end) {
    for (int i = start; i < end; i++) {
      if (!Character.isWhitespace(input.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Creates one annotation for every match of the pattern in the document text. The annotation
   * covers capture group 1. When the matched text can be parsed with the given format, the
   * annotation's short date string is set to the parsed value formatted with
   * {@link #dfDateShort}.
   *
   * @param m
   *          factory for the annotation type to create; must produce a {@link DateTimeAnnot}
   * @param pattern
   *          pattern whose capture group 1 delimits the annotated text
   * @param dateFormat
   *          format used to parse the matched text; it is cloned, never used directly
   */
  void makeAnnotations(Maker m, Pattern pattern, DateFormat dateFormat) {
    if (input == null) {
      return; // no document text, nothing to match
    }
    // Parse with a private copy: DateFormat is not thread-safe and several of the formats
    // handed in are static, i.e. shared by all annotator instances.
    DateFormat parser = (DateFormat) dateFormat.clone();
    Matcher matcher = pattern.matcher(input);
    while (matcher.find()) {
      int start = matcher.start(1);
      String matched = fixUpDateTimeStrings(matcher.group(1));
      DateTimeAnnot dtAnnot = (DateTimeAnnot) m.newAnnotation(jcas, start, matcher.end(1));
      pp.setIndex(0);
      // With a ParsePosition, parse() reads only as much of the string as it needs and
      // returns null on failure instead of throwing.
      Date dtSpec = parser.parse(matched, pp);
      if (dtSpec != null) {
        dtAnnot.setShortDateString(dfDateShort.format(dtSpec));
      }
      dtAnnot.addToIndexes();
    }
  }

  /**
   * Completes a matched date or time string so that it can be parsed.
   *
   * <p>
   * A string containing a colon is treated as a time. If it does not already end with AM/PM, a
   * marker is guessed from the hour: hour 0 and hours of 9 or more get " AM", hours 1 through 8
   * get " PM". Any other string is treated as a date and gets ", " plus {@link #defaultYear}
   * appended.
   *
   * @param s
   *          the matched text
   * @return the completed string; a time string whose leading hour cannot be parsed is returned
   *         unchanged
   */
  String fixUpDateTimeStrings(String s) {
    pp.setIndex(0);
    if (s.contains(":")) {
      // have time string
      if (s.endsWith("AM") || s.endsWith("PM") || s.endsWith("am") || s.endsWith("pm")) {
        return s;
      }
      Number parsedHour = hourParser.parse(s, pp);
      if (parsedHour == null) {
        return s; // no leading number, leave the string as it is
      }
      int hour = parsedHour.intValue();
      String av = (hour != 0 && hour < 9) ? " PM" : " AM"; // append value
      return s + av;
    }
    // have date string
    return s + ", " + defaultYear; // in case no year available
  }
}