/* NOTICE This software was produced for the U. S. Government under Contract No. W15P7T-11-C-F600, and is subject to
 * the Rights in Noncommercial Computer Software and Noncommercial Computer Software Documentation Clause 252.227-7014
 * (JUN 1995) Copyright 2010 The MITRE Corporation. All Rights Reserved. */
package org.opensextant.toolbox;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Controller;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ControllerAwarePR;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.RunTime;
import gate.util.InvalidOffsetException;

/**
 * This PR categorizes noun phrases by looking at the vocabulary and other
 * entities that they contain.
 * <p>
 * For each document it: (1) attaches a "Category" feature to every Token,
 * layered from part of speech, then overlapping hierarchical vocabulary, then
 * overlapping entity annotations; (2) attaches a category sequence to each
 * noun phrase; (3) categorizes each noun phrase from that sequence via a small
 * set of rules; (4) optionally co-references otherwise uncategorized phrases
 * against previously categorized ones; and (5) emits a derived entity
 * annotation for each categorized phrase.
 */
@CreoleResource(name = "OpenSextant Sequence Abstractor", comment = "Categorizes Annotations by examining"
    + " the vocabulary and entities they contain")
public class ChunkCategorizerPR2 extends AbstractLanguageAnalyser implements ProcessingResource, ControllerAwarePR {
  private static final long serialVersionUID = 1L;
  /** The annotationSet into which the created annotations will be written. */
  private String outputAnnotationSet;
  /** The name of the noun phrase annotation to categorize. */
  String nounPhraseAnnoName;
  /** The feature name which identifies a vocabulary entity. */
  String vocabFeatureName = "hierarchy";
  /** What portion of the NounPhrase should be tagged as a derived entity? */
  boolean markPhrase = true;
  /** Do co-referencing for otherwise uncategorized annotations. */
  boolean doCoref = true;
  /**
   * Co-referencing mapping &lt;word,category&gt;. Transient: rebuilt per
   * document in {@link #execute()}, and re-created there if this PR was
   * deserialized (a transient field comes back null).
   */
  private transient Map<String, String> wordCatMap = new HashMap<String, String>();
  /** Log object. */
  private static final Logger LOGGER = LoggerFactory.getLogger(ChunkCategorizerPR2.class);

  private void initialize() {
    LOGGER.info("Initializing ");
  }

  /** Do the initialization. */
  @Override
  public Resource init() throws ResourceInstantiationException {
    initialize();
    return this;
  }

  /** Re-do the initialization. */
  @Override
  public void reInit() throws ResourceInstantiationException {
    initialize();
  }

  /** Do the work. */
  @Override
  public void execute() throws ExecutionException {
    // get the annotation set into which we will place any annotations
    AnnotationSet annotSet = (outputAnnotationSet == null || "".equals(outputAnnotationSet)) ? document
        .getAnnotations() : document.getAnnotations(outputAnnotationSet);
    // get all of the noun phrase chunks annotations
    AnnotationSet npSet = document.getAnnotations().get(nounPhraseAnnoName);
    // get all of the vocabulary and simple entity annotations.
    // get all of the hierarchically tagged vocab
    Set<String> hierFeatureNameSet = new HashSet<String>();
    hierFeatureNameSet.add("hierarchy");
    AnnotationSet vocabSet = document.getAnnotations().get(null, hierFeatureNameSet);
    // get all of the previously tagged entities (has feature "isEntity")
    Set<String> entityFeatureNameSet = new HashSet<String>();
    entityFeatureNameSet.add("isEntity");
    AnnotationSet entitySet = document.getAnnotations().get(null, entityFeatureNameSet);
    // get all of the tokens
    AnnotationSet tokenSet = document.getAnnotations().get("Token");
    // categorize all tokens based on the vocab and entities
    categorizeTokens(tokenSet, vocabSet, entitySet);
    // re-create the co-ref mapping if this PR was deserialized (transient
    // field is null then); otherwise just clear it for this document
    if (wordCatMap == null) {
      wordCatMap = new HashMap<String, String>();
    }
    wordCatMap.clear();
    // do the work
    for (Annotation np : npSet) {
      // attach a category sequence to each noun phrase
      attachCategorySequence(np, tokenSet);
      // categorize the noun phases based on the category sequence
      categorize(np);
      // add the np and category info to co-reference map
      if (doCoref) {
        addToCorefMap(np);
      }
    }
    // categorize any noun phrase not handled by above by co-referencing to
    // already categorized noun phrases
    if (doCoref) {
      for (Annotation np : npSet) {
        coRef(np);
      }
    }
    // output any entities derived from the noun phrase
    for (Annotation np : npSet) {
      createDerivedEntities(np, annotSet);
    }
  }
  /** End execute. */

  @Override
  public void controllerExecutionAborted(Controller arg0, Throwable arg1) throws ExecutionException {
    LOGGER.info("Chunker Categorizer aborted");
  }

  @Override
  public void controllerExecutionFinished(Controller arg0) throws ExecutionException {
    LOGGER.info("Chunker Categorizer finished");
  }

  @Override
  public void controllerExecutionStarted(Controller arg0) throws ExecutionException {
    LOGGER.info("Chunker Categorizer started");
  }

  public String getAnnotationName() {
    return nounPhraseAnnoName;
  }

  @RunTime
  @CreoleParameter(defaultValue = "NounPhrase")
  public void setAnnotationName(String annotationName) {
    this.nounPhraseAnnoName = annotationName;
  }

  /**
   * Add a "Category" feature to all tokens, layered in priority order: part
   * of speech ("P." prefix), then any overlapping vocabulary ("V." prefix),
   * then any overlapping entity ("E." prefix) — later layers overwrite
   * earlier ones.
   */
  private void categorizeTokens(AnnotationSet tokenSet, AnnotationSet vocabSet, AnnotationSet entitySet) {
    // thin out the hierarchical vocab so nested shorter matches don't win
    String thinnedVocabName = "TEMP_thinnedVocab";
    AnnotationSet thinnedVocabSet = thinAnnotations(vocabSet, thinnedVocabName);
    for (Annotation a : tokenSet) {
      Long start = a.getStartNode().getOffset();
      Long end = a.getEndNode().getOffset();
      FeatureMap tmpMap = a.getFeatures();
      // first layer - Part of Speech already on Token
      tmpMap.put("Category", "P." + reducePOSTags((String) tmpMap.get("pos")));
      // could add non hierarchical vocab here
      // second layer - type from any overlapping Vocab
      AnnotationSet vSet = thinnedVocabSet.get(start, end);
      if (!vSet.isEmpty()) {
        Annotation tmpVocab = vSet.iterator().next();
        String tmpCatLabel = tmpVocab.getType();
        String tmpCatHier = (String) tmpVocab.getFeatures().get("hierarchy");
        tmpMap.put("Category", "V." + tmpCatLabel + "/" + tmpCatHier);
      }
      // third layer - type from any overlapping Entities
      AnnotationSet eSet = entitySet.get(start, end);
      if (!eSet.isEmpty()) {
        Annotation tmpEntity = eSet.iterator().next();
        String tmpCatLabel = tmpEntity.getType();
        String tmpCatHier = (String) tmpEntity.getFeatures().get("hierarchy");
        tmpMap.put("Category", "E." + tmpCatLabel + "/" + tmpCatHier);
      }
    }
    // remove the temporary thinned vocab sets
    document.removeAnnotationSet(thinnedVocabName);
  }

  /**
   * Attach CategorySequence, CategorySequence_Reduced and ProperSequence
   * features to a NounPhrase. The reduced sequence is one character per
   * token: "V" (vocab), "E" (entity), "P" (proper noun) or "x" (don't care).
   */
  private void attachCategorySequence(Annotation np, AnnotationSet tokens) {
    Long start = np.getStartNode().getOffset();
    Long end = np.getEndNode().getOffset();
    AnnotationSet tokensInNP = tokens.get(start, end);
    List<Annotation> tokenList = gate.Utils.inDocumentOrder(tokensInNP);
    List<String> categorySequence = new ArrayList<String>();
    List<String> properSequence = new ArrayList<String>();
    String reducedCatSeq = "";
    for (Annotation a : tokenList) {
      String tmpCat = (String) a.getFeatures().get("Category");
      categorySequence.add(tmpCat);
      String redCat = tmpCat.split("\\.")[0];
      if ("P".equals(redCat)) {
        if (tmpCat.startsWith("P.Proper")) {
          // keep proper-noun strings (length > 2) for later co-referencing
          String tmpProper = gate.Utils.cleanStringFor(document, a);
          if (tmpProper.length() > 2) {
            properSequence.add(tmpProper);
          }
          reducedCatSeq = reducedCatSeq + "P";
        } else {
          reducedCatSeq = reducedCatSeq + "x";
        }
      } else {
        reducedCatSeq = reducedCatSeq + redCat;
      }
    }
    reducedCatSeq = reducedCatSeq.trim();
    np.getFeatures().put("CategorySequence", categorySequence);
    np.getFeatures().put("CategorySequence_Reduced", reducedCatSeq);
    np.getFeatures().put("ProperSequence", properSequence);
  }

  /**
   * Categorize a nounPhrase based on its category sequence. Rules are checked
   * in order and are mutually exclusive by construction (each requires a
   * different tail pattern); the winning rule number is recorded in the
   * "CategorizationRule" feature and the derived type/hierarchy in
   * "EntityType"/"hierarchy".
   */
  private void categorize(Annotation np) {
    List<?> categories = (List<?>) np.getFeatures().get("CategorySequence");
    String reducedCatSeq = (String) np.getFeatures().get("CategorySequence_Reduced");
    String cat = "";
    String type = "";
    String hier = "";
    int rule = -1;
    // Rule #0 - seq is all Entities and misc = already handled
    if (reducedCatSeq.matches("[Ex]+")) {
      rule = 0;
    } else {
      // Rule #1 - seq ends with vocab -> type = type of Vocab
      if (reducedCatSeq.endsWith("V")) {
        cat = (String) categories.get(categories.size() - 1);
        rule = 1;
      }
      // Rule #2 - seq ends with vocab and 1 Proper -> type = type of Vocab
      if (reducedCatSeq.matches(".*VP$")) {
        cat = (String) categories.get(categories.size() - 2);
        rule = 2;
      }
      // Rule #3 - seq ends with vocab and 2 Propers - type = type of Vocab
      if (reducedCatSeq.matches(".*VPP$")) {
        cat = (String) categories.get(categories.size() - 3);
        rule = 3;
      }
      // Rule #4 - seq ends with vocab and 3 Propers - type = type of Vocab
      // ($ anchor added for consistency with rules 2 and 3; String.matches
      // already requires a whole-string match, so behavior is unchanged)
      if (reducedCatSeq.matches(".*VPPP$")) {
        cat = (String) categories.get(categories.size() - 4);
        rule = 4;
      }
      if (cat != null && cat.length() > 0) {
        String[] typePieces = cat.split("/");
        // strip off the leading "V."
        type = typePieces[0].replaceFirst("^V\\.", "");
        // guard against a malformed category with no "/" separator
        hier = typePieces.length > 1 ? typePieces[1] : "";
      }
    }
    np.getFeatures().put("CategorizationRule", rule);
    if (type != null && type.length() > 0) {
      np.getFeatures().put("EntityType", type);
      np.getFeatures().put("hierarchy", hier);
    }
  }

  /**
   * Derive an entity annotation from a categorized nounphrase and add it to
   * the given annotation set.
   * <p>
   * NOTE(review): when {@code markPhrase} is false this creates a zero-length
   * annotation at offset 0 with an empty string — presumably a placeholder;
   * confirm this is the intended behavior.
   */
  private void createDerivedEntities(Annotation np, AnnotationSet as) {
    String entType = (String) np.getFeatures().get("EntityType");
    if (entType != null && entType.length() > 0) {
      Long start = 0L;
      Long end = 0L;
      String str = "";
      // if we are tagging the whole noun phrase as the entity
      if (markPhrase) {
        str = gate.Utils.cleanStringFor(document, np);
        start = np.getStartNode().getOffset();
        end = np.getEndNode().getOffset();
      }
      String hier = (String) np.getFeatures().get("hierarchy");
      FeatureMap fm = gate.Factory.newFeatureMap();
      fm.put("string", str);
      fm.put("hierarchy", hier);
      fm.put("EntityType", entType);
      fm.put("isEntity", true);
      try {
        as.add(start, end, entType, fm);
      } catch (InvalidOffsetException e) {
        LOGGER.error("Invalid Offset exception when creating Entity annotation", e);
      }
    }
  }

  /** Populate the co-referencing map from a categorized noun phrase. */
  private void addToCorefMap(Annotation np) {
    String tmpType = (String) np.getFeatures().get("EntityType");
    if (tmpType == null || tmpType.length() < 1) {
      return;
    }
    String tmpHier = (String) np.getFeatures().get("hierarchy");
    List<?> propers = (List<?>) np.getFeatures().get("ProperSequence");
    for (Object o : propers) {
      String wrd = (String) o;
      // only person names are co-referenced, keyed by lowercased word
      if (wrd.length() > 2 && tmpHier.startsWith("Person.name")) {
        wordCatMap.put(wrd.toLowerCase(), tmpType + "/" + tmpHier);
      }
    }
  }

  /**
   * Categorize an as-yet-uncategorized noun phrase by co-referencing its
   * proper nouns against the words collected in {@link #addToCorefMap}.
   * When several words match, the last one in document order wins.
   */
  private void coRef(Annotation np) {
    String tmpType = (String) np.getFeatures().get("EntityType");
    // only coref if not already categorized
    if (tmpType != null && tmpType.length() > 0) {
      return;
    }
    List<?> propers = (List<?>) np.getFeatures().get("ProperSequence");
    String cat = "";
    String type = "";
    String hier = "";
    // look for a previously tagged word (single map lookup; the map never
    // stores null values, so get()==null is equivalent to !containsKey())
    for (Object o : propers) {
      String mapped = wordCatMap.get(((String) o).toLowerCase());
      if (mapped != null) {
        cat = mapped;
      }
    }
    // if we have found a previously tagged word, use that category
    if (cat != null && cat.length() > 0) {
      String[] typePieces = cat.split("/");
      // strip off the leading "V."
      type = typePieces[0].replaceFirst("^V\\.", "");
      // guard against a malformed category with no "/" separator
      hier = typePieces.length > 1 ? typePieces[1] : "";
      np.getFeatures().put("CategorizationRule", 5);
      if (type != null && type.length() > 0) {
        np.getFeatures().put("EntityType", type);
        np.getFeatures().put("hierarchy", hier);
      }
    }
  }

  /**
   * Thin out the annotation set by removing any annotation which is
   * completely within but not identical (in length) to another. Survivors
   * are copied into a new annotation set with the given name on the document
   * (the caller is responsible for removing that temporary set).
   */
  private AnnotationSet thinAnnotations(AnnotationSet annoSet, String setName) {
    List<Annotation> survivorList = new ArrayList<Annotation>(annoSet);
    for (Annotation currentAnno : annoSet) {
      // get all annotations that "cover" the current.
      AnnotationSet coverSet = gate.Utils.getCoveringAnnotations(annoSet, currentAnno);
      for (Annotation a : coverSet) {
        // if the current is smaller than something in the cover set
        // remove it from survivor list
        if (gate.Utils.length(currentAnno) < gate.Utils.length(a)) {
          survivorList.remove(currentAnno);
        }
      }
    }
    // add all of the survivors to the "Thinned" annotation set
    AnnotationSet thinnedSet = document.getAnnotations(setName);
    thinnedSet.addAll(survivorList);
    return thinnedSet;
  }

  /** Reduce the part of speech tags to just "Proper" and "x" (don't care). */
  private String reducePOSTags(String tag) {
    if (tag == null) {
      return "x";
    }
    // Penn-style proper noun tags begin with "NP" (NP, NPS)
    if (tag.matches("NP.*")) {
      return "Proper";
    }
    return "x";
  }
}