/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.uima.examples.cas; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.CasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.annotator.AnnotatorConfigurationException; import org.apache.uima.analysis_engine.annotator.AnnotatorContext; import org.apache.uima.analysis_engine.annotator.AnnotatorInitializationException; import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException; import org.apache.uima.analysis_engine.annotator.TextAnnotator; import org.apache.uima.cas.CAS; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.FSTypeConstraint; import org.apache.uima.cas.FeatureStructure; import org.apache.uima.cas.Type; import org.apache.uima.cas.TypeSystem; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.resource.ResourceAccessException; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.util.Level; /** * Annotator that find substrings of the input document that match regular expressions. * <p> * There are two ways to specify the regular expressions - via configuration parameters or via an * external resource file. * <p> * This annotator takes the following optional configuration parameters: * <ul> * <li><code>Patterns</code> - array of Strings indicating regular expressions to match. The * pattern language is described at <a * href="http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html"> * http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html</a>) </li> * <li><code>TypeNames</code> - array of Strings indicating names of Types to be created from the * patterns. </li> * <li><code>ContainingAnnotationTypes</code> - an array of input annotation types. This * annotator will only produce new annotations that are contained within existing annotaions of * these types. (This is optional.) </li> * <li><code>AnnotateEntireContainedAnnotation</code> - When the ContainingAnnoationTypes * parameter is specified, a value of true for this parameter will cause the entire containing * annotation to be used as the span of the new annotation, rather than just the span of the regular * expression match. This can be used to "classify" previously created annotations according to * whether or not they contain text matching a regular expression. </li> * </ul> * <p> * The indices of the <code>Patterns</code> and <code>TypeNames</code> arrays correspond, so * that a substring that matches <code>Patterns[i]</code> will result in an annotation of type * <code>TypeNames[i]</code>. * <p> * It is also possible to provide an external resource file that declares the annotation type names * and the regular expressions to match. The annotator will look for this file under the resource * key "PatternFile". The file format is as follows: * <ul> * <li>Lines starting with # or whitepsace are ignored</li> * <li>Lines starting with % indicate an annotation type</li> * <li>All other lines are regular expressions, using the same syntax described for the * <code>Patterns</code> configuration parameter.</li> * </ul> * If a regular expression is matched, it will be annotated with the last annotation type declared * (the nearest preceding line starting with %). * * */ public class RegExAnnotator extends CasAnnotator_ImplBase { public static final String MESSAGE_DIGEST = "org.apache.uima.examples.cas.RegExAnnotator_Messages"; /** * Performs any startup tasks required by this annotator. This implementation reads the * configuration parmaeters and compiles the regular expressions. * * @see TextAnnotator#initialize(AnnotatorContext) */ public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); try { // Retrieve configuration parameters String[] patternStrings = (String[]) getContext().getConfigParameterValue("Patterns"); String[] typeNames = (String[]) getContext().getConfigParameterValue("TypeNames"); mContainingAnnotationTypeNames = (String[]) getContext().getConfigParameterValue( "ContainingAnnotationTypes"); if (mContainingAnnotationTypeNames != null && mContainingAnnotationTypeNames.length > 0) { mAnnotateEntireContainingAnnotation = (Boolean) getContext().getConfigParameterValue( "AnnotateEntireContainingAnnotation"); } else { mAnnotateEntireContainingAnnotation = Boolean.FALSE; } // create an ArrayList of type names and an ArrayList of pattern arrays, // where the indexes of the two lists corespond so that the patterns // at patternArray[i] correspond to the annotation type at // mTypeNames[i]. mTypeNames = new ArrayList(); ArrayList patternArray = new ArrayList(); if (patternStrings != null) { if (typeNames == null || typeNames.length != patternStrings.length) { // throw exception - error message in external message digest throw new ResourceInitializationException(MESSAGE_DIGEST, "type_pattern_array_length_mismatch", new Object[0]); } mTypeNames.addAll(Arrays.asList(typeNames)); for (int i = 0; i < patternStrings.length; i++) { patternArray.add(new String[] { patternStrings[i] }); } } // if PatternFile resource exists, parse it and add to patternArray InputStream in = getContext().getResourceAsStream("PatternFile"); if (in != null) { try { ArrayList patternsForCurrentType = new ArrayList(); boolean foundFirstType = false; // get buffered reader BufferedReader reader = new BufferedReader(new InputStreamReader(in)); // read lines from file String line = reader.readLine(); while (line != null) { if (!line.startsWith("#") && line.length() > 0 && !Character.isWhitespace(line.charAt(0))) { // line is not a comment if (line.startsWith("%")) // annotation type name { // add pattern array for previous type (if any) to list if (foundFirstType) { String[] pats = new String[patternsForCurrentType.size()]; patternsForCurrentType.toArray(pats); patternArray.add(pats); patternsForCurrentType.clear(); } // add new type name to mTypeNames list mTypeNames.add(line.substring(1)); foundFirstType = true; } else // treat as regular expression { patternsForCurrentType.add(line); } } line = reader.readLine(); } // add last group of pattersn to patternArray String[] pats = new String[patternsForCurrentType.size()]; patternsForCurrentType.toArray(pats); patternArray.add(pats); } finally { if (in != null) { in.close(); } } } // make sure there is at least one pattern if (patternArray.isEmpty()) { throw new ResourceInitializationException( AnnotatorConfigurationException.ONE_PARAM_REQUIRED, new Object[] { "Patterns, Pattern File" }); } // compile regular expression patterns mPatterns = new Pattern[patternArray.size()][]; for (int i = 0; i < patternArray.size(); i++) { String[] pats = (String[]) patternArray.get(i); mPatterns[i] = new Pattern[pats.length]; for (int j = 0; j < mPatterns[i].length; j++) { try { mPatterns[i][j] = Pattern.compile(pats[j]); // make sure no pattern matches the empty string - as this // would lead to infinite loops during processing if (mPatterns[i][j].matcher("").matches()) { throw new ResourceInitializationException(MESSAGE_DIGEST, "regex_matches_empty_string", new Object[] { pats[j] }); } } catch (PatternSyntaxException e) { throw new ResourceInitializationException(MESSAGE_DIGEST, "regex_syntax_error", new Object[] { pats[j] }, e); } } } } catch (ResourceAccessException e) { throw new ResourceInitializationException(e); } catch (IOException e) { throw new ResourceInitializationException(e); } } /** * Acquires references to CAS Type and Feature objects that are later used during the * {@link #process(CAS)} method. * * @see TextAnnotator#typeSystemInit(TypeSystem) */ public void typeSystemInit(TypeSystem aTypeSystem) throws AnalysisEngineProcessException { // get references to annotation types we will create mCASTypes = new Type[mTypeNames.size()]; for (int i = 0; i < mTypeNames.size(); i++) { String curTypeName = (String) mTypeNames.get(i); mCASTypes[i] = aTypeSystem.getType(curTypeName); if (mCASTypes[i] == null) { throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND, new Object[] { this.getClass().getName(), curTypeName }); } } // get references to Containing Annotation Types if (mContainingAnnotationTypeNames == null) { mContainingAnnotationTypes = null; } else { mContainingAnnotationTypes = new Type[mContainingAnnotationTypeNames.length]; for (int i = 0; i < mContainingAnnotationTypes.length; i++) { mContainingAnnotationTypes[i] = aTypeSystem.getType(mContainingAnnotationTypeNames[i]); if (mContainingAnnotationTypes[i] == null) { throw new AnalysisEngineProcessException(AnnotatorInitializationException.TYPE_NOT_FOUND, new Object[] { getClass().getName(), mContainingAnnotationTypeNames[i] }); } } } } /** * Invokes this annotator's analysis logic. This annotator uses the java regular expression * package to find annotations using the regular expressions defined by its configuration * parameters. * * @param aCAS * the CAS to process * @param aResultSpec * A list of outputs that this annotator should produce. * * @throws AnnotatorProcessException * if a failure occurs during processing. * * @see CasAnnotator_ImplBase#process(CAS) */ public void process(CAS aCAS) throws AnalysisEngineProcessException { try { String docText = aCAS.getDocumentText(); // Determine which regions of the document we are going to annotate int[] rangesToAnnotate = getRangesToAnnotate(aCAS); // We treat the rangesToAnnotate array as a list of (start,end) offset // pairs. Iterate through all of these pairs. for (int i = 0; i < rangesToAnnotate.length; i += 2) { int startPos = rangesToAnnotate[i]; int endPos = rangesToAnnotate[i + 1]; // get the substring of text to be annotated String subText = docText.substring(startPos, endPos); // iterate over all annotation types for which we have patterns for (int j = 0; j < mCASTypes.length; j++) { // see if the ResultSpec contains this type if (getResultSpecification().containsType(mCASTypes[j].getName(),aCAS.getDocumentLanguage()) || getResultSpecification().containsType(mCASTypes[j].getName())) { // try to match each pattern that we have for this annotation type for (int k = 0; k < mPatterns[j].length; k++) { int pos = 0; Matcher matcher = mPatterns[j][k].matcher(subText); while (pos < subText.length() && matcher.find(pos)) { getContext().getLogger().log(Level.FINER, "RegEx match found: [" + matcher.group() + "]"); // match found; extract locations of start and end of match // (or of entire containing annotation, if that option is on) int annotStart, annotEnd; if (mAnnotateEntireContainingAnnotation.booleanValue()) { annotStart = startPos; annotEnd = endPos; } else { annotStart = startPos + matcher.start(); annotEnd = startPos + matcher.end(); } // create Annotation in CAS FeatureStructure fs = aCAS.createAnnotation(mCASTypes[j], annotStart, annotEnd); aCAS.getIndexRepository().addFS(fs); pos = annotEnd - startPos; } } } } } } catch (Exception e) { throw new AnalysisEngineProcessException(e); } } /** * Utility method that determines which subranges of the document text should be annotated by this * annotator. This is done as follows: * <ul> * <li>If <code>mContainingAnnotationTypes</code> is <code>null</code>, the entire document * is eligible for annotation.</li> * <li>If <code>mContainingAnnotationTypes</code> is not <code>null</code>, then each of its * elements is expected to be an Annotation Type name. The CAS is queried for existing annotations * of any of these Types, and the only subranges of the document eligible for annotation are those * subranges contained within such annotations.</li> * </ul> * * @param aCAS * CAS currently being processed * * @return an array of integers indicating the document subranges eligible for annotation. Begin * and end positions of the subranges are stored in successive elements of the array. For * example, elements 0 and 1 are the start and end of the first subrange; elements 2 and 3 * are the start and end of the second subrange, and so on. */ protected int[] getRangesToAnnotate(CAS aCAS) { if (mContainingAnnotationTypes == null || mContainingAnnotationTypes.length == 0) { // ContainingAnnotationTypes is not set - the whole document is eligible return new int[] { 0, aCAS.getDocumentText().length() }; } else { // get iterator over all annotations in the CAS FSIterator iterator = aCAS.getAnnotationIndex().iterator(); // filter the iterator so that only instances of Types in the // mContainingAnnotationTypes array are returned FSTypeConstraint constraint = aCAS.getConstraintFactory().createTypeConstraint(); for (int i = 0; i < mContainingAnnotationTypes.length; i++) { constraint.add(mContainingAnnotationTypes[i]); } iterator = aCAS.createFilteredIterator(iterator, constraint); // iterate over annotations and add them to an ArrayList List annotationList = new ArrayList(); while (iterator.isValid()) { annotationList.add(iterator.get()); iterator.moveToNext(); } // For each Annotation in the list, add its start and end // positions to the result array. int numRanges = annotationList.size(); int[] result = new int[numRanges * 2]; for (int j = 0; j < numRanges; j++) { AnnotationFS curFS = (AnnotationFS) annotationList.get(j); result[j * 2] = curFS.getBegin(); result[j * 2 + 1] = curFS.getEnd(); } return result; } } /** * The regular expression Patterns to be matched. */ private Pattern[][] mPatterns; /** * The names of the CAS types that this annotator produces from the patterns in {@link #mPatterns}. */ private ArrayList mTypeNames; /** * The names of the CAS types within which this annotator will search for new annotations. This * may be null, indicating that the entire document will be searched. */ private String[] mContainingAnnotationTypeNames; /** * The CAS types corresponding to {@link #mTypeNames}. */ private Type[] mCASTypes; /** * The CAS types corresponding to {@link #mContainingAnnotationTypeNames}. */ private Type[] mContainingAnnotationTypes; /** * Whether to annotate the entire span of the containing annotation when a match is found. */ private Boolean mAnnotateEntireContainingAnnotation; }