/*
NOTICE
This software was produced for the U. S. Government
under Contract No. W15P7T-11-C-F600, and is
subject to the Rights in Noncommercial Computer Software
and Noncommercial Computer Software Documentation
Clause 252.227-7014 (JUN 1995)
Copyright 2010 The MITRE Corporation. All Rights Reserved.
*/
package org.opensextant.toolbox;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Controller;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ControllerAwarePR;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;
/**
 * This PR categorizes noun-phrase annotations by examining the vocabulary
 * and other entity annotations that they contain.
 */
@CreoleResource(name = "OpenSextant Sequence Abstractor", comment = "Categorizes Annotations by examining"
    + " the vocabulary and entities they contain")
public class ChunkCategorizerPR2 extends AbstractLanguageAnalyser implements ProcessingResource, ControllerAwarePR {
  private static final long serialVersionUID = 1L;
  /** The annotationSet into which the created annotations will be written. */
  private String outputAnnotationSet;
  /** The name of the noun phrase annotation to categorize. */
  String nounPhraseAnnoName;
  /** The feature name which identifies a hierarchically tagged vocabulary annotation. */
  String vocabFeatureName = "hierarchy";
  /**
   * If true, the whole noun phrase span is tagged as the derived entity.
   * NOTE(review): when false, a zero-length (0,0) annotation is created in
   * createDerivedEntities() — looks incomplete; confirm intended behavior.
   */
  boolean markPhrase = true;
  /** Do co-referencing for otherwise uncategorized annotations. */
  boolean doCoref = true;
  /**
   * Co-referencing map (lower-cased proper word -> "type/hierarchy").
   * Rebuilt for every document. Transient, so it may be null after
   * deserialization; execute() re-creates it defensively.
   */
  private transient Map<String, String> wordCatMap = new HashMap<String, String>();
  /** Log object. */
  private static final Logger LOGGER = LoggerFactory.getLogger(ChunkCategorizerPR2.class);

  /** Shared initialization used by both init() and reInit(). */
  private void initialize() {
    LOGGER.info("Initializing ");
  }

  /** Do the initialization. */
  @Override
  public Resource init() throws ResourceInstantiationException {
    initialize();
    return this;
  }

  /** Re-do the initialization. */
  @Override
  public void reInit() throws ResourceInstantiationException {
    initialize();
  }

  /**
   * Do the work: categorize every token from vocab/entity overlaps, attach a
   * category sequence to each noun phrase, categorize the phrases, optionally
   * co-reference the leftovers, and finally emit derived entity annotations.
   */
  @Override
  public void execute() throws ExecutionException {
    // get the annotation set into which we will place any annotations
    AnnotationSet annotSet = (outputAnnotationSet == null || "".equals(outputAnnotationSet))
        ? document.getAnnotations() : document.getAnnotations(outputAnnotationSet);
    // get all of the noun phrase chunk annotations
    AnnotationSet npSet = document.getAnnotations().get(nounPhraseAnnoName);
    // get all of the hierarchically tagged vocab: any annotation carrying the
    // configured vocab feature (default "hierarchy")
    Set<String> hierFeatureNameSet = new HashSet<String>();
    hierFeatureNameSet.add(vocabFeatureName);
    AnnotationSet vocabSet = document.getAnnotations().get(null, hierFeatureNameSet);
    // get all of the previously tagged entities (has feature "isEntity")
    Set<String> entityFeatureNameSet = new HashSet<String>();
    entityFeatureNameSet.add("isEntity");
    AnnotationSet entitySet = document.getAnnotations().get(null, entityFeatureNameSet);
    // get all of the tokens
    AnnotationSet tokenSet = document.getAnnotations().get("Token");
    // categorize all tokens based on the vocab and entities
    categorizeTokens(tokenSet, vocabSet, entitySet);
    // (re)create and clear the co-ref mapping; the field is transient and is
    // null if this PR instance was deserialized
    if (wordCatMap == null) {
      wordCatMap = new HashMap<String, String>();
    } else {
      wordCatMap.clear();
    }
    // do the work
    for (Annotation np : npSet) {
      // attach a category sequence to each noun phrase
      attachCategorySequence(np, tokenSet);
      // categorize the noun phrases based on the category sequence
      categorize(np);
      // add the np and category info to the co-reference map
      if (doCoref) {
        addToCorefMap(np);
      }
    }
    // categorize any noun phrase not handled above by co-referencing to
    // already categorized noun phrases
    if (doCoref) {
      for (Annotation np : npSet) {
        coRef(np);
      }
    }
    // output any entities derived from the noun phrases
    for (Annotation np : npSet) {
      createDerivedEntities(np, annotSet);
    }
  }

  @Override
  public void controllerExecutionAborted(Controller arg0, Throwable arg1) throws ExecutionException {
    LOGGER.info("Chunker Categorizer aborted");
  }

  @Override
  public void controllerExecutionFinished(Controller arg0) throws ExecutionException {
    LOGGER.info("Chunker Categorizer finished");
  }

  @Override
  public void controllerExecutionStarted(Controller arg0) throws ExecutionException {
    LOGGER.info("Chunker Categorizer started");
  }

  public String getAnnotationName() {
    return nounPhraseAnnoName;
  }

  @RunTime
  @CreoleParameter(defaultValue = "NounPhrase")
  public void setAnnotationName(String annotationName) {
    this.nounPhraseAnnoName = annotationName;
  }

  /**
   * Add a "Category" feature to every token. Three layers, each overriding the
   * previous when present: part of speech ("P."), overlapping thinned vocab
   * ("V."), overlapping previously tagged entity ("E.").
   */
  private void categorizeTokens(AnnotationSet tokenSet, AnnotationSet vocabSet, AnnotationSet entitySet) {
    // thin out the hierarchical vocab so only the longest overlapping spans remain
    String thinnedVocabName = "TEMP_thinnedVocab";
    AnnotationSet thinnedVocabSet = thinAnnotations(vocabSet, thinnedVocabName);
    for (Annotation a : tokenSet) {
      Long start = a.getStartNode().getOffset();
      Long end = a.getEndNode().getOffset();
      FeatureMap tmpMap = a.getFeatures();
      // first layer - Part of Speech already on Token
      tmpMap.put("Category", "P." + reducePOSTags((String) tmpMap.get("pos")));
      // could add non-hierarchical vocab here
      // second layer - type from any overlapping Vocab (first overlap wins)
      AnnotationSet vSet = thinnedVocabSet.get(start, end);
      if (!vSet.isEmpty()) {
        Annotation tmpVocab = vSet.iterator().next();
        String tmpCatLabel = tmpVocab.getType();
        String tmpCatHier = (String) tmpVocab.getFeatures().get(vocabFeatureName);
        tmpMap.put("Category", "V." + tmpCatLabel + "/" + tmpCatHier);
      }
      // third layer - type from any overlapping Entities (first overlap wins)
      AnnotationSet eSet = entitySet.get(start, end);
      if (!eSet.isEmpty()) {
        Annotation tmpEntity = eSet.iterator().next();
        String tmpCatLabel = tmpEntity.getType();
        String tmpCatHier = (String) tmpEntity.getFeatures().get("hierarchy");
        tmpMap.put("Category", "E." + tmpCatLabel + "/" + tmpCatHier);
      }
    }
    // remove the temporary thinned vocab set
    document.removeAnnotationSet(thinnedVocabName);
  }

  /**
   * Attach CategorySequence (per-token category labels), CategorySequence_Reduced
   * (one character per token: "E"/"V"/"P"/"x") and ProperSequence (proper-noun
   * strings longer than 2 chars) features to a NounPhrase.
   */
  private void attachCategorySequence(Annotation np, AnnotationSet tokens) {
    Long start = np.getStartNode().getOffset();
    Long end = np.getEndNode().getOffset();
    AnnotationSet tokensInNP = tokens.get(start, end);
    List<Annotation> tokenList = gate.Utils.inDocumentOrder(tokensInNP);
    List<String> categorySequence = new ArrayList<String>();
    List<String> properSequence = new ArrayList<String>();
    StringBuilder reducedCatSeq = new StringBuilder();
    for (Annotation a : tokenList) {
      String tmpCat = (String) a.getFeatures().get("Category");
      categorySequence.add(tmpCat);
      // the layer prefix ("P", "V" or "E") before the first '.'
      String redCat = tmpCat.split("\\.")[0];
      if ("P".equals(redCat)) {
        if (tmpCat.startsWith("P.Proper")) {
          String tmpProper = gate.Utils.cleanStringFor(document, a);
          if (tmpProper.length() > 2) {
            properSequence.add(tmpProper);
          }
          reducedCatSeq.append('P');
        } else {
          // non-proper POS tokens are "don't care"
          reducedCatSeq.append('x');
        }
      } else {
        reducedCatSeq.append(redCat);
      }
    }
    np.getFeatures().put("CategorySequence", categorySequence);
    np.getFeatures().put("CategorySequence_Reduced", reducedCatSeq.toString().trim());
    np.getFeatures().put("ProperSequence", properSequence);
  }

  /**
   * Categorize a nounPhrase based on its reduced category sequence. The last
   * matching rule wins; rules 1-4 all take the type from the vocab token that
   * precedes 0-3 trailing Propers. Writes CategorizationRule always, and
   * EntityType/hierarchy only when a vocab category was found.
   */
  private void categorize(Annotation np) {
    List<?> categories = (List<?>) np.getFeatures().get("CategorySequence");
    String reducedCatSeq = (String) np.getFeatures().get("CategorySequence_Reduced");
    String cat = "";
    String type = "";
    String hier = "";
    int rule = -1;
    // Rule #0 - seq is all Entities and misc = already handled
    if (reducedCatSeq.matches("[Ex]+")) {
      rule = 0;
    } else {
      // Rule #1 - seq ends with vocab -> type = type of Vocab
      if (reducedCatSeq.endsWith("V")) {
        cat = (String) categories.get(categories.size() - 1);
        rule = 1;
      }
      // Rule #2 - seq ends with vocab and 1 Proper -> type = type of Vocab
      if (reducedCatSeq.matches(".*VP$")) {
        cat = (String) categories.get(categories.size() - 2);
        rule = 2;
      }
      // Rule #3 - seq ends with vocab and 2 Propers -> type = type of Vocab
      if (reducedCatSeq.matches(".*VPP$")) {
        cat = (String) categories.get(categories.size() - 3);
        rule = 3;
      }
      // Rule #4 - seq ends with vocab and 3 Propers -> type = type of Vocab
      if (reducedCatSeq.matches(".*VPPP$")) {
        cat = (String) categories.get(categories.size() - 4);
        rule = 4;
      }
      if (cat != null && cat.length() > 0) {
        String[] typeAndHier = splitCategoryLabel(cat);
        type = typeAndHier[0];
        hier = typeAndHier[1];
      }
    }
    np.getFeatures().put("CategorizationRule", rule);
    if (type != null && type.length() > 0) {
      np.getFeatures().put("EntityType", type);
      np.getFeatures().put("hierarchy", hier);
    }
  }

  /**
   * Split a "V.type/hierarchy" category label into {type, hierarchy},
   * stripping the leading "V." layer prefix. The hierarchy part is "" when the
   * label contains no '/' (guards the previous ArrayIndexOutOfBoundsException).
   */
  private String[] splitCategoryLabel(String cat) {
    String[] pieces = cat.split("/");
    String type = pieces[0].replaceFirst("^V\\.", "");
    String hier = (pieces.length > 1) ? pieces[1] : "";
    return new String[] {type, hier};
  }

  /**
   * Derive an entity annotation (type = EntityType feature) from a categorized
   * nounPhrase and add it to the given annotation set.
   */
  private void createDerivedEntities(Annotation np, AnnotationSet as) {
    String entType = (String) np.getFeatures().get("EntityType");
    if (entType != null && entType.length() > 0) {
      Long start = 0L;
      Long end = 0L;
      String str = "";
      // if we are tagging the whole noun phrase as the entity
      if (markPhrase) {
        str = gate.Utils.cleanStringFor(document, np);
        start = np.getStartNode().getOffset();
        end = np.getEndNode().getOffset();
      }
      String hier = (String) np.getFeatures().get("hierarchy");
      FeatureMap fm = gate.Factory.newFeatureMap();
      fm.put("string", str);
      fm.put("hierarchy", hier);
      fm.put("EntityType", entType);
      fm.put("isEntity", true);
      try {
        as.add(start, end, entType, fm);
      } catch (InvalidOffsetException e) {
        LOGGER.error("Invalid Offset exception when creating Entity annotation", e);
      }
    }
  }

  /**
   * Populate the co-referencing map from a categorized noun phrase: each proper
   * word (length > 2) of a Person.name phrase maps to "type/hierarchy".
   */
  private void addToCorefMap(Annotation np) {
    String tmpType = (String) np.getFeatures().get("EntityType");
    if (tmpType == null || tmpType.length() < 1) {
      return;
    }
    String tmpHier = (String) np.getFeatures().get("hierarchy");
    List<?> propers = (List<?>) np.getFeatures().get("ProperSequence");
    for (Object o : propers) {
      String wrd = (String) o;
      if (wrd.length() > 2 && tmpHier != null && tmpHier.startsWith("Person.name")) {
        wordCatMap.put(wrd.toLowerCase(), tmpType + "/" + tmpHier);
      }
    }
  }

  /**
   * Categorize a not-yet-categorized noun phrase by looking its proper words up
   * in the co-reference map (last hit wins). Successful co-reference is
   * recorded as CategorizationRule 5.
   */
  private void coRef(Annotation np) {
    String tmpType = (String) np.getFeatures().get("EntityType");
    // only coref if not already categorized
    if (tmpType != null && tmpType.length() > 0) {
      return;
    }
    List<?> propers = (List<?>) np.getFeatures().get("ProperSequence");
    String cat = "";
    // look for a previously tagged word (single map lookup instead of
    // contains-then-get)
    for (Object o : propers) {
      String mapped = wordCatMap.get(((String) o).toLowerCase());
      if (mapped != null) {
        cat = mapped;
      }
    }
    // if we have found a previously tagged word, use that category
    if (cat != null && cat.length() > 0) {
      String[] typeAndHier = splitCategoryLabel(cat);
      String type = typeAndHier[0];
      String hier = typeAndHier[1];
      np.getFeatures().put("CategorizationRule", 5);
      if (type != null && type.length() > 0) {
        np.getFeatures().put("EntityType", type);
        np.getFeatures().put("hierarchy", hier);
      }
    }
  }

  /**
   * Thin out the annotation set by removing any annotation which is completely
   * within but not identical (in length) to another. Survivors are copied into
   * a new named annotation set on the document (caller must remove it).
   */
  private AnnotationSet thinAnnotations(AnnotationSet annoSet, String setName) {
    List<Annotation> survivorList = new ArrayList<Annotation>(annoSet);
    for (Annotation currentAnno : annoSet) {
      // get all annotations that "cover" the current.
      AnnotationSet coverSet = gate.Utils.getCoveringAnnotations(annoSet, currentAnno);
      for (Annotation a : coverSet) {
        // if the current is smaller than something in the cover set,
        // remove it from the survivor list
        if (gate.Utils.length(currentAnno) < gate.Utils.length(a)) {
          survivorList.remove(currentAnno);
        }
      }
    }
    // add all of the survivors to the "Thinned" annotation set
    AnnotationSet thinnedSet = document.getAnnotations(setName);
    thinnedSet.addAll(survivorList);
    return thinnedSet;
  }

  /** Reduce the part-of-speech tags to just "Proper" (NP*) and "x" (don't care). */
  private String reducePOSTags(String tag) {
    if (tag == null) {
      return "x";
    }
    // startsWith is equivalent to matches("NP.*") and avoids regex cost
    if (tag.startsWith("NP")) {
      return "Proper";
    }
    return "x";
  }
}