//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.uima;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.base.Joiner;

import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * A helper class to deal with Text annotations within documents.
 *
 * For annotators which wish to work on each text annotation (text area) independently, in order to
 * process the important content of the document, this annotator and the TextBlock class provide a
 * simplified approach.
 *
 * Implementations may choose to override the standard doProcess method and then use the helper
 * methods (getTextInTextBlocks, getTextBlocks), or they may simply override doProcessTextBlock,
 * which is called for each block in turn.
 *
 * If no text areas are present then the annotator defaults to the entire document (effectively
 * providing backwards compatibility). Even if text annotations are present, the pipeline
 * configuration can still decide to use the whole document through the wholeDocument parameter.
 *
 * Note that the value of getTextBlocks is not just a list of Text annotations, but an abstraction
 * called {@link TextBlock} which provides helper functions for managing the difference between
 * whole-document and partial text annotations, and the implication of this for annotation offsets.
 *
 * For clarity on annotation offsets: UIMA requires an annotation's begin and end offsets to be
 * relative to the document text. If you are working with text areas then the covered text is a
 * subset of the entire document, so any offsets within that subset must be converted to document
 * offsets before creating annotations. TextBlock helps with this.
 *
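 * For example, a minimal implementation might look like the following sketch (the class name and
 * the matched term are purely illustrative; see {@link TextBlock} for the available offset
 * helpers):
 *
 * <pre>{@code
 * public class ExampleTextAnnotator extends BaleenTextAwareAnnotator {
 *   protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
 *     // Work only on the text covered by this block
 *     String text = block.getCoveredText();
 *
 *     int begin = text.indexOf("example");
 *     if (begin >= 0) {
 *       int end = begin + "example".length();
 *       // begin and end are block-relative offsets; convert them to document offsets
 *       // via the TextBlock helpers before creating any annotation
 *     }
 *   }
 * }
 * }</pre>
 *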
 * @baleen.javadoc
 */
public abstract class BaleenTextAwareAnnotator extends BaleenAnnotator {

  public static final String TEXT_BLOCK_SEPARATOR = "\n\n";

  private static final Joiner TEXT_BLOCK_JOINER = Joiner.on(TEXT_BLOCK_SEPARATOR);

  /**
   * If true, then the whole document will be treated as a single text block
   *
   * @baleen.config false
   */
  public static final String PARAM_WHOLE_DOCUMENT = "wholeDocument";

  @ConfigurationParameter(name = PARAM_WHOLE_DOCUMENT, defaultValue = "false")
  private boolean wholeDocumentAsText;

  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final List<TextBlock> blocks = getTextBlocks(jCas);
    for (final TextBlock b : blocks) {
      doProcessTextBlock(b);
    }
  }

  /**
   * Process a text block.
   *
   * This is only called if doProcess is not overridden by a child class.
   *
   * The default implementation does nothing (so children do not need to call super).
   *
   * @param block the block to process
   * @throws AnalysisEngineProcessException the analysis engine process exception
   */
  protected void doProcessTextBlock(final TextBlock block) throws AnalysisEngineProcessException {
    // Do nothing
  }

  /**
   * Get the text areas within the document.
   *
   * Provides a list of text blocks representing the text annotations within the document.
   *
   * @param jCas the jCas to get the text blocks from
   * @return list of text blocks (non-null, possibly a singleton)
   */
  protected List<TextBlock> getTextBlocks(final JCas jCas) {
    if (!wholeDocumentAsText) {
      final Collection<Text> collection = JCasUtil.select(jCas, Text.class);

      // If there are no Text annotations then process the whole document instead.
      // This is effectively legacy compatibility: such documents will have no structural or text
      // annotations, so this preserves the functionality of existing pipelines.
      // TODO: Perhaps this should be configurable as a parameter?

      if (!collection.isEmpty()) {
        return collection.stream()
            .map(t -> new TextBlock(jCas, t))
            .collect(Collectors.toList());
      }
    }

    // Either whole-document mode is enabled or there are no Text annotations, so fall back to a
    // single block covering the whole document
    return Collections.singletonList(new TextBlock(jCas));
  }

  /**
   * Gets the text in all text blocks.
   *
   * The blocks will be separated by the constant TEXT_BLOCK_SEPARATOR (but that same pattern may
   * occur naturally in the document).
   *
   * Note that offsets within this string make no sense for the creation of annotations. If you
   * wish to create annotations you should use either a TextBlock (which has block-relative
   * offsets) or the document text (which has absolute offsets).
   *
   * This is really for annotators which need to read the text but not create annotations at
   * specific offsets within it, for example keyword extraction.
   *
   * @param jCas the jCas
   * @return the combined text of all text blocks
   */
  protected String getTextInTextBlocks(final JCas jCas) {
    final List<TextBlock> blocks = getTextBlocks(jCas);

    if (blocks.isEmpty()) {
      // If it's empty, save ourselves the work
      return "";
    } else if (blocks.size() == 1 && blocks.get(0).isWholeDocument()) {
      // If the single text block is the whole document, avoid creating new large strings
      return jCas.getDocumentText();
    } else {
      return TEXT_BLOCK_JOINER.join(blocks.stream().map(TextBlock::getCoveredText).iterator());
    }
  }

  /**
   * Allow a child class to specifically override the wholeDocument parameter.
   *
   * It is suggested this is called in doInitialise (after the call to super.doInitialise). This
   * allows a child to enforce the type of processing if only one type is sensible (typically
   * forcing whole-document mode).
   *
   * @param wholeDocument the new whole document as text
   */
  protected void setWholeDocumentAsText(final boolean wholeDocument) {
    this.wholeDocumentAsText = wholeDocument;
  }

  /**
   * Checks if whole document mode is enabled.
   *
   * @return true, if enabled
   */
  protected boolean isWholeDocumentAsText() {
    return wholeDocumentAsText;
  }
}