//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.uima;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import com.google.common.base.Joiner;

import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * A helper class to deal with Text annotations within documents.
 *
 * For annotators which wish to work on each text annotation (text area) independently, in order to
 * process the important content of the document, this annotator and the TextBlock class provide a
 * simplified approach.
 *
 * Implementations may choose to override the standard doProcess method and then use the helper
 * methods (getTextInTextBlocks, getTextBlocks), or they may simply override doProcessTextBlock,
 * which is called for each block in turn.
 *
 * If no text areas are present then the annotator defaults to the entire document (effectively
 * providing backwards compatibility). Even if text annotations are present, the pipeline
 * configuration can still decide to use the whole document through the wholeDocument parameter.
 *
 * Note that the value of getTextBlocks is not just a list of Text annotations, but an abstraction
 * called {@link TextBlock} which provides helper functions for managing the difference between
 * whole-document and partial text annotations, and the implication of this for annotation offsets.
 *
 * For clarity on annotation offsets: UIMA requires an annotation's begin and end offsets to be
 * relative to the document text. If you are working with text areas then the covered text is a
 * subset of the entire document, so any offsets within that subset must be converted to document
 * offsets before creating annotations. TextBlock helps with this.
 *
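 * For example, a minimal implementation might look like the following sketch (the class name and
 * the matched term are purely illustrative; see {@link TextBlock} for the available offset
 * helpers):
 *
 * <pre>{@code
 * public class ExampleTextAnnotator extends BaleenTextAwareAnnotator {
 *   protected void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
 *     // Work only on the text covered by this block
 *     String text = block.getCoveredText();
 *
 *     int begin = text.indexOf("example");
 *     if (begin >= 0) {
 *       int end = begin + "example".length();
 *       // begin and end are block-relative offsets; convert them to document offsets
 *       // via the TextBlock helpers before creating any annotation
 *     }
 *   }
 * }
 * }</pre>
 *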
 * @baleen.javadoc
 */
public abstract class BaleenTextAwareAnnotator extends BaleenAnnotator {

  public static final String TEXT_BLOCK_SEPARATOR = "\n\n";

  private static final Joiner TEXT_BLOCK_JOINER = Joiner.on(TEXT_BLOCK_SEPARATOR);

  /**
   * If true, then the whole document will be treated as a single text block
   *
   * @baleen.config false
   */
  public static final String PARAM_WHOLE_DOCUMENT = "wholeDocument";

  @ConfigurationParameter(name = PARAM_WHOLE_DOCUMENT, defaultValue = "false")
  private boolean wholeDocumentAsText;

  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final List<TextBlock> blocks = getTextBlocks(jCas);
    for (final TextBlock b : blocks) {
      doProcessTextBlock(b);
    }
  }

  /**
   * Process a text block.
   *
   * This is only called if doProcess is not overridden by a child class.
   *
   * The default implementation does nothing (so children do not need to call super).
   *
   * @param block the block to process
   * @throws AnalysisEngineProcessException the analysis engine process exception
   */
  protected void doProcessTextBlock(final TextBlock block) throws AnalysisEngineProcessException {
    // Do nothing
  }

  /**
   * Get the text areas within the document.
   *
   * Provides a list of text blocks representing the text annotations within the document.
   *
   * @param jCas the jCas to get the text blocks from
   * @return list of text blocks (non-null, possibly a singleton)
   */
  protected List<TextBlock> getTextBlocks(final JCas jCas) {
    if (!wholeDocumentAsText) {
      final Collection<Text> collection = JCasUtil.select(jCas, Text.class);

      // If there are no Text annotations then process the whole document instead.
      // This is effectively legacy compatibility: such documents will have no structural or text
      // annotations, so this preserves the functionality of existing pipelines.
      // TODO: Perhaps this should be configurable as a parameter?

      if (!collection.isEmpty()) {
        return collection.stream()
            .map(t -> new TextBlock(jCas, t))
            .collect(Collectors.toList());
      }
    }

    // Either whole-document mode is enabled or there are no Text annotations, so fall back to a
    // single block covering the whole document
    return Collections.singletonList(new TextBlock(jCas));
  }

  /**
   * Gets the text in all text blocks.
   *
   * The blocks will be separated by the constant TEXT_BLOCK_SEPARATOR (but that same pattern may
   * occur naturally in the document).
   *
   * Note that offsets within this string make no sense for the creation of annotations. If you
   * wish to create annotations you should use either a TextBlock (which has block-relative
   * offsets) or the document text (which has absolute offsets).
   *
   * This is really for annotators which need to read the text but not create annotations at
   * specific offsets within it, for example keyword extraction.
   *
   * @param jCas the jCas
   * @return the combined text of all text blocks
   */
  protected String getTextInTextBlocks(final JCas jCas) {
    final List<TextBlock> blocks = getTextBlocks(jCas);

    if (blocks.isEmpty()) {
      // If it's empty, save ourselves the work
      return "";
    } else if (blocks.size() == 1 && blocks.get(0).isWholeDocument()) {
      // If the single text block is the whole document, avoid creating new large strings
      return jCas.getDocumentText();
    } else {
      return TEXT_BLOCK_JOINER.join(blocks.stream().map(TextBlock::getCoveredText).iterator());
    }
  }

  /**
   * Allow a child class to specifically override the wholeDocument parameter.
   *
   * It is suggested this is called in doInitialise (after the call to super.doInitialise). This
   * allows a child to enforce the type of processing if only one type is sensible (typically
   * forcing whole-document mode).
   *
   * @param wholeDocument the new whole document as text
   */
  protected void setWholeDocumentAsText(final boolean wholeDocument) {
    this.wholeDocumentAsText = wholeDocument;
  }

  /**
   * Checks if whole document mode is enabled.
   *
   * @return true, if enabled
   */
  protected boolean isWholeDocumentAsText() {
    return wholeDocumentAsText;
  }
}