//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.common.structure;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.language.Text;
import uk.gov.dstl.baleen.types.structure.Aside;
import uk.gov.dstl.baleen.types.structure.Caption;
import uk.gov.dstl.baleen.types.structure.DefinitionDescription;
import uk.gov.dstl.baleen.types.structure.DefinitionItem;
import uk.gov.dstl.baleen.types.structure.Details;
import uk.gov.dstl.baleen.types.structure.Heading;
import uk.gov.dstl.baleen.types.structure.ListItem;
import uk.gov.dstl.baleen.types.structure.Paragraph;
import uk.gov.dstl.baleen.types.structure.Preformatted;
import uk.gov.dstl.baleen.types.structure.Quotation;
import uk.gov.dstl.baleen.types.structure.Structure;
import uk.gov.dstl.baleen.types.structure.Summary;
import uk.gov.dstl.baleen.types.structure.TableCell;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
import uk.gov.dstl.baleen.uima.utils.StructureUtil;
/**
* Converts selected Structure annotations to Text annotations.
*
* Rather than every annotator needing to deal with every kind of structure type in order to get
* the right text from a document, this annotator maps selected (configurable) structural types to
* Text annotations.
*
* The default list of structure types to map is Paragraph, Aside, Details, ListItem, TableCell,
* Summary, Quotation, Heading, Caption, DefinitionItem, DefinitionDescription, Preformatted.
*
* This list can be configured by providing class names (or fully qualified class names) to the
* types field.
*
* This annotator ensures that no Text annotations overlap. If they did overlap then other
* annotators would process the same text (within two different Text annotations), resulting in
* duplicate annotations. You can control the overlap removal by setting the keepSmallest parameter.
*
* NOTE: Test cases are in baleen-annotators
*
* @baleen.javadoc
*
*/
public class TextBlocks extends BaleenAnnotator {

  /** Structure types converted to Text when the {@code types} parameter is not supplied. */
  private static final Set<Class<? extends Structure>> DEFAULT_STRUCTURAL_CLASSES =
      ImmutableSet.of(
          Paragraph.class,
          Aside.class,
          Details.class,
          ListItem.class,
          TableCell.class,
          Summary.class,
          Quotation.class,
          Heading.class,
          Caption.class,
          DefinitionItem.class,
          DefinitionDescription.class,
          Preformatted.class);

  /**
   * A list of structural types which will be mapped to Text annotations.
   *
   * @baleen.config Paragraph,TableCell,ListItem,Aside, ...
   */
  public static final String PARAM_TYPE_NAMES = "types";

  @ConfigurationParameter(name = PARAM_TYPE_NAMES, mandatory = false)
  private String[] typeNames;

  /**
   * In order to remove overlapping Text annotations we can either remove the annotation covering
   * (biggest) or the annotations covered (smallest).
   *
   * We default to picking the smallest units of text.
   *
   * @baleen.config true
   */
  public static final String PARAM_KEEP_SMALLEST = "keepSmallest";

  @ConfigurationParameter(name = PARAM_KEEP_SMALLEST, defaultValue = "true")
  private boolean keepSmallest;

  /** The structure classes to convert; resolved once in {@link #doInitialize(UimaContext)}. */
  private Set<Class<? extends Structure>> structuralClasses;

  /**
   * Resolves the configured type names into structure classes, falling back to the default set
   * when no type names were configured.
   *
   * @param aContext the UIMA context, passed on to the superclass
   * @throws ResourceInitializationException if the superclass initialisation fails
   */
  @Override
  public void doInitialize(final UimaContext aContext) throws ResourceInitializationException {
    super.doInitialize(aContext);

    if (typeNames == null || typeNames.length == 0) {
      structuralClasses = Sets.newHashSet(DEFAULT_STRUCTURAL_CLASSES);
    } else {
      structuralClasses = StructureUtil.getStructureClasses(typeNames);
    }
  }

  /**
   * Adds Text annotations for the selected structure types and then removes overlapping Text
   * annotations.
   *
   * If the document has no Structure annotations at all, a single Text annotation spanning the
   * whole document text is added instead.
   *
   * @param jCas the document to process
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException {
    final Collection<Structure> structures = JCasUtil.select(jCas, Structure.class);

    if (structures.isEmpty()) {
      // If the jCas has no structural annotations then the entire text should be marked as a
      // text block
      addWholeDocumentText(jCas);
      return;
    }

    // Otherwise add the types we want (exact class match against the configured set)...
    structures.stream()
        .filter(s -> structuralClasses.contains(s.getClass()))
        .map(s -> new Text(jCas, s.getBegin(), s.getEnd()))
        .forEach(this::addToJCasIndex);

    // ...collapse Text annotations sharing the same span so the overlap removal below cannot
    // delete every copy of a span...
    removeDuplicateSpans(jCas);

    // ...and finally keep only the biggest / most detailed blocks as requested.
    removeOverlappingText(jCas);
  }

  /**
   * Adds one Text annotation covering the entire document text. Does nothing when the document
   * has no text, rather than failing with a NullPointerException.
   */
  private void addWholeDocumentText(final JCas jCas) {
    final String documentText = jCas.getDocumentText();
    if (documentText == null) {
      return;
    }
    addToJCasIndex(new Text(jCas, 0, documentText.length()));
  }

  /**
   * Keeps a single Text annotation per distinct (begin, end) span, removing any others.
   *
   * Text annotations with identical offsets (for example a TableCell entirely filled by one
   * Paragraph) are each reported by uimaFIT as covering, and covered by, the other. Left in
   * place, the overlap removal would therefore remove all of them and that text would end up
   * with no Text annotation at all.
   */
  private void removeDuplicateSpans(final JCas jCas) {
    final Set<Long> seenSpans = new HashSet<>();
    final Set<Text> duplicates = new HashSet<>();

    for (final Text text : JCasUtil.select(jCas, Text.class)) {
      // Pack begin/end into a single long so the pair can be used as a set key
      final long span = ((long) text.getBegin() << 32) | (text.getEnd() & 0xFFFFFFFFL);
      if (!seenSpans.add(span)) {
        duplicates.add(text);
      }
    }

    // Remove only after iterating, so the index is not modified while it is being read
    duplicates.forEach(this::removeFromJCasIndex);
  }

  /**
   * Removes Text annotations that overlap others: the covering annotations when keepSmallest is
   * set, otherwise the covered annotations.
   */
  private void removeOverlappingText(final JCas jCas) {
    final Map<Text, Collection<Text>> cover;
    if (keepSmallest) {
      cover = JCasUtil.indexCovering(jCas, Text.class, Text.class);
    } else {
      cover = JCasUtil.indexCovered(jCas, Text.class, Text.class);
    }

    // Remove where x has been pulled out as covering itself (potential bug introduced in
    // UIMAfit 2.3.0)
    cover.forEach((text, others) -> others.remove(text));

    cover.values().stream().flatMap(Collection::stream).forEach(this::removeFromJCasIndex);
  }

  /**
   * Describes this annotator to the pipeline orderer: the configured structure classes are
   * declared as inputs and no outputs are declared.
   *
   * NOTE(review): this annotator creates Text annotations but declares an empty output set -
   * confirm whether Text should be listed as an output.
   *
   * @return the action listing the structure classes as inputs
   */
  @SuppressWarnings("unchecked")
  @Override
  public AnalysisEngineAction getAction() {
    Set<Class<? extends Annotation>> classes = new HashSet<>();
    for (Class<?> c : structuralClasses) {
      classes.add((Class<? extends Annotation>) c);
    }

    return new AnalysisEngineAction(classes, Collections.emptySet());
  }
}