/* NOTICE This software was produced for the U. S. Government under Contract No. W15P7T-11-C-F600, and is subject to
 * the Rights in Noncommercial Computer Software and Noncommercial Computer Software Documentation Clause 252.227-7014
 * (JUN 1995) Copyright 2010 The MITRE Corporation. All Rights Reserved. */
package org.opensextant.toolbox;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Controller;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ControllerAwarePR;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;

/**
 * This PR categorizes noun phrases by looking at the vocabulary and other
 * entities that they contain.
 * <p>
 * For each document it: (1) attaches a "Category" feature to every Token,
 * layered from part of speech, then overlapping hierarchical vocabulary, then
 * overlapping entity annotations; (2) attaches a category sequence to each
 * noun phrase; (3) categorizes each noun phrase from that sequence via a small
 * set of rules; (4) optionally co-references otherwise uncategorized phrases
 * against previously categorized ones; and (5) emits a derived entity
 * annotation for each categorized phrase.
 */
@CreoleResource(name = "OpenSextant Sequence Abstractor", comment = "Categorizes Annotations by examining"
    + " the vocabulary and entities they contain")
public class ChunkCategorizerPR2 extends AbstractLanguageAnalyser implements ProcessingResource, ControllerAwarePR {
  private static final long serialVersionUID = 1L;
  /** The annotationSet into which the created annotations will be written. */
  private String outputAnnotationSet;
  /** The name of the noun phrase annotation to categorize. */
  String nounPhraseAnnoName;
  /** The feature name which identifies a vocabulary entity. */
  String vocabFeatureName = "hierarchy";
  /** What portion of the NounPhrase should be tagged as a derived entity? */
  boolean markPhrase = true;
  /** Do co-referencing for otherwise uncategorized annotations. */
  boolean doCoref = true;
  /**
   * Co-referencing mapping &lt;word,category&gt;. Transient: rebuilt per
   * document in {@link #execute()}, and re-created there if this PR was
   * deserialized (a transient field comes back null).
   */
  private transient Map<String, String> wordCatMap = new HashMap<String, String>();
  /** Log object. */
  private static final Logger LOGGER = LoggerFactory.getLogger(ChunkCategorizerPR2.class);

  private void initialize() {
    LOGGER.info("Initializing ");
  }

  /** Do the initialization. */
  @Override
  public Resource init() throws ResourceInstantiationException {
    initialize();
    return this;
  }

  /** Re-do the initialization. */
  @Override
  public void reInit() throws ResourceInstantiationException {
    initialize();
  }

  /** Do the work. */
  @Override
  public void execute() throws ExecutionException {
    // get the annotation set into which we will place any annotations
    AnnotationSet annotSet = (outputAnnotationSet == null || "".equals(outputAnnotationSet)) ? document
        .getAnnotations() : document.getAnnotations(outputAnnotationSet);
    // get all of the noun phrase chunks annotations
    AnnotationSet npSet = document.getAnnotations().get(nounPhraseAnnoName);
    // get all of the vocabulary and simple entity annotations.
    // get all of the hierarchically tagged vocab
    Set<String> hierFeatureNameSet = new HashSet<String>();
    hierFeatureNameSet.add("hierarchy");
    AnnotationSet vocabSet = document.getAnnotations().get(null, hierFeatureNameSet);
    // get all of the previously tagged entities (has feature "isEntity")
    Set<String> entityFeatureNameSet = new HashSet<String>();
    entityFeatureNameSet.add("isEntity");
    AnnotationSet entitySet = document.getAnnotations().get(null, entityFeatureNameSet);
    // get all of the tokens
    AnnotationSet tokenSet = document.getAnnotations().get("Token");
    // categorize all tokens based on the vocab and entities
    categorizeTokens(tokenSet, vocabSet, entitySet);
    // re-create the co-ref mapping if this PR was deserialized (transient
    // field is null then); otherwise just clear it for this document
    if (wordCatMap == null) {
      wordCatMap = new HashMap<String, String>();
    }
    wordCatMap.clear();
    // do the work
    for (Annotation np : npSet) {
      // attach a category sequence to each noun phrase
      attachCategorySequence(np, tokenSet);
      // categorize the noun phases based on the category sequence
      categorize(np);
      // add the np and category info to co-reference map
      if (doCoref) {
        addToCorefMap(np);
      }
    }
    // categorize any noun phrase not handled by above by co-referencing to
    // already categorized noun phrases
    if (doCoref) {
      for (Annotation np : npSet) {
        coRef(np);
      }
    }
    // output any entities derived from the noun phrase
    for (Annotation np : npSet) {
      createDerivedEntities(np, annotSet);
    }
  }
  /** End execute. */

  @Override
  public void controllerExecutionAborted(Controller arg0, Throwable arg1) throws ExecutionException {
    LOGGER.info("Chunker Categorizer aborted");
  }

  @Override
  public void controllerExecutionFinished(Controller arg0) throws ExecutionException {
    LOGGER.info("Chunker Categorizer finished");
  }

  @Override
  public void controllerExecutionStarted(Controller arg0) throws ExecutionException {
    LOGGER.info("Chunker Categorizer started");
  }

  public String getAnnotationName() {
    return nounPhraseAnnoName;
  }

  @RunTime
  @CreoleParameter(defaultValue = "NounPhrase")
  public void setAnnotationName(String annotationName) {
    this.nounPhraseAnnoName = annotationName;
  }

  /**
   * Add a "Category" feature to all tokens, layered in priority order: part
   * of speech ("P." prefix), then any overlapping vocabulary ("V." prefix),
   * then any overlapping entity ("E." prefix) — later layers overwrite
   * earlier ones.
   */
  private void categorizeTokens(AnnotationSet tokenSet, AnnotationSet vocabSet, AnnotationSet entitySet) {
    // thin out the hierarchical vocab so nested shorter matches don't win
    String thinnedVocabName = "TEMP_thinnedVocab";
    AnnotationSet thinnedVocabSet = thinAnnotations(vocabSet, thinnedVocabName);
    for (Annotation a : tokenSet) {
      Long start = a.getStartNode().getOffset();
      Long end = a.getEndNode().getOffset();
      FeatureMap tmpMap = a.getFeatures();
      // first layer - Part of Speech already on Token
      tmpMap.put("Category", "P." + reducePOSTags((String) tmpMap.get("pos")));
      // could add non hierarchical vocab here
      // second layer - type from any overlapping Vocab
      AnnotationSet vSet = thinnedVocabSet.get(start, end);
      if (!vSet.isEmpty()) {
        Annotation tmpVocab = vSet.iterator().next();
        String tmpCatLabel = tmpVocab.getType();
        String tmpCatHier = (String) tmpVocab.getFeatures().get("hierarchy");
        tmpMap.put("Category", "V." + tmpCatLabel + "/" + tmpCatHier);
      }
      // third layer - type from any overlapping Entities
      AnnotationSet eSet = entitySet.get(start, end);
      if (!eSet.isEmpty()) {
        Annotation tmpEntity = eSet.iterator().next();
        String tmpCatLabel = tmpEntity.getType();
        String tmpCatHier = (String) tmpEntity.getFeatures().get("hierarchy");
        tmpMap.put("Category", "E." + tmpCatLabel + "/" + tmpCatHier);
      }
    }
    // remove the temporary thinned vocab sets
    document.removeAnnotationSet(thinnedVocabName);
  }

  /**
   * Attach CategorySequence, CategorySequence_Reduced and ProperSequence
   * features to a NounPhrase. The reduced sequence is one character per
   * token: "V" (vocab), "E" (entity), "P" (proper noun) or "x" (don't care).
   */
  private void attachCategorySequence(Annotation np, AnnotationSet tokens) {
    Long start = np.getStartNode().getOffset();
    Long end = np.getEndNode().getOffset();
    AnnotationSet tokensInNP = tokens.get(start, end);
    List<Annotation> tokenList = gate.Utils.inDocumentOrder(tokensInNP);
    List<String> categorySequence = new ArrayList<String>();
    List<String> properSequence = new ArrayList<String>();
    String reducedCatSeq = "";
    for (Annotation a : tokenList) {
      String tmpCat = (String) a.getFeatures().get("Category");
      categorySequence.add(tmpCat);
      String redCat = tmpCat.split("\\.")[0];
      if ("P".equals(redCat)) {
        if (tmpCat.startsWith("P.Proper")) {
          // keep proper-noun strings (length > 2) for later co-referencing
          String tmpProper = gate.Utils.cleanStringFor(document, a);
          if (tmpProper.length() > 2) {
            properSequence.add(tmpProper);
          }
          reducedCatSeq = reducedCatSeq + "P";
        } else {
          reducedCatSeq = reducedCatSeq + "x";
        }
      } else {
        reducedCatSeq = reducedCatSeq + redCat;
      }
    }
    reducedCatSeq = reducedCatSeq.trim();
    np.getFeatures().put("CategorySequence", categorySequence);
    np.getFeatures().put("CategorySequence_Reduced", reducedCatSeq);
    np.getFeatures().put("ProperSequence", properSequence);
  }

  /**
   * Categorize a nounPhrase based on its category sequence. Rules are checked
   * in order and are mutually exclusive by construction (each requires a
   * different tail pattern); the winning rule number is recorded in the
   * "CategorizationRule" feature and the derived type/hierarchy in
   * "EntityType"/"hierarchy".
   */
  private void categorize(Annotation np) {
    List<?> categories = (List<?>) np.getFeatures().get("CategorySequence");
    String reducedCatSeq = (String) np.getFeatures().get("CategorySequence_Reduced");
    String cat = "";
    String type = "";
    String hier = "";
    int rule = -1;
    // Rule #0 - seq is all Entities and misc = already handled
    if (reducedCatSeq.matches("[Ex]+")) {
      rule = 0;
    } else {
      // Rule #1 - seq ends with vocab -> type = type of Vocab
      if (reducedCatSeq.endsWith("V")) {
        cat = (String) categories.get(categories.size() - 1);
        rule = 1;
      }
      // Rule #2 - seq ends with vocab and 1 Proper -> type = type of Vocab
      if (reducedCatSeq.matches(".*VP$")) {
        cat = (String) categories.get(categories.size() - 2);
        rule = 2;
      }
      // Rule #3 - seq ends with vocab and 2 Propers - type = type of Vocab
      if (reducedCatSeq.matches(".*VPP$")) {
        cat = (String) categories.get(categories.size() - 3);
        rule = 3;
      }
      // Rule #4 - seq ends with vocab and 3 Propers - type = type of Vocab
      // ($ anchor added for consistency with rules 2 and 3; String.matches
      // already requires a whole-string match, so behavior is unchanged)
      if (reducedCatSeq.matches(".*VPPP$")) {
        cat = (String) categories.get(categories.size() - 4);
        rule = 4;
      }
      if (cat != null && cat.length() > 0) {
        String[] typePieces = cat.split("/");
        // strip off the leading "V."
        type = typePieces[0].replaceFirst("^V\\.", "");
        // guard against a malformed category with no "/" separator
        hier = typePieces.length > 1 ? typePieces[1] : "";
      }
    }
    np.getFeatures().put("CategorizationRule", rule);
    if (type != null && type.length() > 0) {
      np.getFeatures().put("EntityType", type);
      np.getFeatures().put("hierarchy", hier);
    }
  }

  /**
   * Derive an entity annotation from a categorized nounphrase and add it to
   * the given annotation set.
   * <p>
   * NOTE(review): when {@code markPhrase} is false this creates a zero-length
   * annotation at offset 0 with an empty string — presumably a placeholder;
   * confirm this is the intended behavior.
   */
  private void createDerivedEntities(Annotation np, AnnotationSet as) {
    String entType = (String) np.getFeatures().get("EntityType");
    if (entType != null && entType.length() > 0) {
      Long start = 0L;
      Long end = 0L;
      String str = "";
      // if we are tagging the whole noun phrase as the entity
      if (markPhrase) {
        str = gate.Utils.cleanStringFor(document, np);
        start = np.getStartNode().getOffset();
        end = np.getEndNode().getOffset();
      }
      String hier = (String) np.getFeatures().get("hierarchy");
      FeatureMap fm = gate.Factory.newFeatureMap();
      fm.put("string", str);
      fm.put("hierarchy", hier);
      fm.put("EntityType", entType);
      fm.put("isEntity", true);
      try {
        as.add(start, end, entType, fm);
      } catch (InvalidOffsetException e) {
        LOGGER.error("Invalid Offset exception when creating Entity annotation", e);
      }
    }
  }

  /** Populate the co-referencing map from a categorized noun phrase. */
  private void addToCorefMap(Annotation np) {
    String tmpType = (String) np.getFeatures().get("EntityType");
    if (tmpType == null || tmpType.length() < 1) {
      return;
    }
    String tmpHier = (String) np.getFeatures().get("hierarchy");
    List<?> propers = (List<?>) np.getFeatures().get("ProperSequence");
    for (Object o : propers) {
      String wrd = (String) o;
      // only person names are co-referenced, keyed by lowercased word
      if (wrd.length() > 2 && tmpHier.startsWith("Person.name")) {
        wordCatMap.put(wrd.toLowerCase(), tmpType + "/" + tmpHier);
      }
    }
  }

  /**
   * Categorize an as-yet-uncategorized noun phrase by co-referencing its
   * proper nouns against the words collected in {@link #addToCorefMap}.
   * When several words match, the last one in document order wins.
   */
  private void coRef(Annotation np) {
    String tmpType = (String) np.getFeatures().get("EntityType");
    // only coref if not already categorized
    if (tmpType != null && tmpType.length() > 0) {
      return;
    }
    List<?> propers = (List<?>) np.getFeatures().get("ProperSequence");
    String cat = "";
    String type = "";
    String hier = "";
    // look for a previously tagged word (single map lookup; the map never
    // stores null values, so get()==null is equivalent to !containsKey())
    for (Object o : propers) {
      String mapped = wordCatMap.get(((String) o).toLowerCase());
      if (mapped != null) {
        cat = mapped;
      }
    }
    // if we have found a previously tagged word, use that category
    if (cat != null && cat.length() > 0) {
      String[] typePieces = cat.split("/");
      // strip off the leading "V."
      type = typePieces[0].replaceFirst("^V\\.", "");
      // guard against a malformed category with no "/" separator
      hier = typePieces.length > 1 ? typePieces[1] : "";
      np.getFeatures().put("CategorizationRule", 5);
      if (type != null && type.length() > 0) {
        np.getFeatures().put("EntityType", type);
        np.getFeatures().put("hierarchy", hier);
      }
    }
  }

  /**
   * Thin out the annotation set by removing any annotation which is
   * completely within but not identical (in length) to another. Survivors
   * are copied into a new annotation set with the given name on the document
   * (the caller is responsible for removing that temporary set).
   */
  private AnnotationSet thinAnnotations(AnnotationSet annoSet, String setName) {
    List<Annotation> survivorList = new ArrayList<Annotation>(annoSet);
    for (Annotation currentAnno : annoSet) {
      // get all annotations that "cover" the current.
      AnnotationSet coverSet = gate.Utils.getCoveringAnnotations(annoSet, currentAnno);
      for (Annotation a : coverSet) {
        // if the current is smaller than something in the cover set
        // remove it from survivor list
        if (gate.Utils.length(currentAnno) < gate.Utils.length(a)) {
          survivorList.remove(currentAnno);
        }
      }
    }
    // add all of the survivors to the "Thinned" annotation set
    AnnotationSet thinnedSet = document.getAnnotations(setName);
    thinnedSet.addAll(survivorList);
    return thinnedSet;
  }

  /** Reduce the part of speech tags to just "Proper" and "x" (don't care). */
  private String reducePOSTags(String tag) {
    if (tag == null) {
      return "x";
    }
    // Penn-style proper noun tags begin with "NP" (NP, NPS)
    if (tag.matches("NP.*")) {
      return "Proper";
    }
    return "x";
  }
}