/*
Copyright 2009-2013 The MITRE Corporation.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
* **************************************************************************
* NOTICE
* This software was produced for the U. S. Government under Contract No.
* W15P7T-12-C-F600, and is subject to the Rights in Noncommercial Computer
* Software and Noncommercial Computer Software Documentation Clause
* 252.227-7014 (JUN 1995)
*
* (c) 2012 The MITRE Corporation. All Rights Reserved.
* **************************************************************************
**/
package org.opensextant.toolbox;
import java.net.URL;
import java.util.List;
import org.opensextant.regex.RegexAnnotation;
import org.opensextant.regex.RegexMatcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import gate.AnnotationSet;
import gate.Factory;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;
@CreoleResource(name = "OpenSextant Regex Finder", comment = "A simple plugin that finds and normalizes entities "
+ "based on Java regular expresssions")
public class RegexFinderPR extends AbstractLanguageAnalyser implements ProcessingResource {
private static final long serialVersionUID = 1375472181851584128L;
/** The Regexmatcher object which does all of the work. */
private transient RegexMatcher reger;
/** The annotationSet into which the dates will be written. */
private String outputAnnotationSet;
/** The file containing the patterns. */
private URL patternFile;
/** The log. */
private static final Logger LOGGER = LoggerFactory.getLogger(RegexFinderPR.class);
/**
* Initializes the DateFinderPR resource.
*/
private void initialize() {
// initialize the regex matcher
reger = new RegexMatcher(patternFile);
}
/** End initialize. */
@Override
public Resource init() throws ResourceInstantiationException {
initialize();
return this;
}
@Override
public void reInit() throws ResourceInstantiationException {
initialize();
}
@Override
public void execute() throws ExecutionException {
// get the annotation set into which we will place any annotations found
AnnotationSet annotSet = (outputAnnotationSet == null || "".equals(outputAnnotationSet))
? document.getAnnotations() : document.getAnnotations(outputAnnotationSet);
// get the text of the document
String text = getDocument().getContent().toString();
// find the matches via the regex matcher
List<RegexAnnotation> matches = reger.match(text);
// loop over all the results
for (RegexAnnotation a : matches) {
// fill in all the annotation features
FeatureMap feats = Factory.newFeatureMap();
feats.putAll(a.getFeatures());
// create the GATE annotation
try {
annotSet.add((long) a.getStart(), (long) a.getEnd(), a.getType(), feats);
} catch (InvalidOffsetException e) {
LOGGER.error("Invalid Offset exception when creating annotation", e);
}
}
}
public String getOutputAnnotationSet() {
return outputAnnotationSet;
}
@Optional
@RunTime
@CreoleParameter
public void setOutputAnnotationSet(String outputAnnotationSet) {
this.outputAnnotationSet = outputAnnotationSet;
}
public URL getPatternFile() {
return patternFile;
}
@CreoleParameter
public void setPatternFile(URL patternFile) {
this.patternFile = patternFile;
}
} // class DateFinderPR