//Dstl (c) Crown Copyright 2017
// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.uima.UimaContext;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Strings;
import com.google.common.collect.Lists;
import io.committed.krill.extraction.Extraction;
import io.committed.krill.extraction.exception.ExtractionException;
import io.committed.krill.extraction.tika.TikaFormatExtractor;
import uk.gov.dstl.baleen.common.structure.TextBlocks;
import uk.gov.dstl.baleen.contentextractors.helpers.AbstractContentExtractor;
import uk.gov.dstl.baleen.contentextractors.helpers.DocumentToJCasConverter;
import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator;
import uk.gov.dstl.baleen.contentmappers.MetaTags;
import uk.gov.dstl.baleen.contentmappers.StructuralAnnotations;
import uk.gov.dstl.baleen.contentmappers.helpers.ContentMapper;
import uk.gov.dstl.baleen.core.utils.BaleenDefaults;
import uk.gov.dstl.baleen.core.utils.BuilderUtils;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.types.structure.Structure;
/**
* Extracts metadata, structural annotations and text content from the supplied input.
*
* Structural annotations are as defined under the Baleen type system with top level
* {@link Structure} class.
*
* Structural extraction allows better understanding of the document by downstream annotators which
* can use the information to segregate the document, rather than treating it as a whole. For
* example, consider using a regex for each cell in a table which is different to considering the
* flat text version of the entire table.
*
* The process of structural content extraction is as follows:
*
* <ul>
* <li>The document is parsed and converted to a rich HTML representation. This is a general 'per
* document format' conversion.
* <li>A set of content manipulators act on the HTML which are configured for this Baleen pipeline.
* These can do anything (add new nodes, remove or amend text, etc). They might be used to clean up
* the HTML or to remove elements which aren't required by the pipeline.
* <li>A set of content mappers convert the HTML nodes into annotations. They may create
* structural elements, or other types such as metadata or entities. The set of content mappers is
* configurable per pipeline.
* <li>The text of the document is extracted. Note that the content mappers can not change the text
* output, if you wish to change the text output then use a content manipulator.
* </ul>
*
* Note that content mapper and content manipulators can work in isolation or in coordination. By
* coordination we mean that a content manipulator might find the most likely title in a document,
* and mark it via introduction of a new HTML span element with a class title. A special content
* mapper could then look for this span and add the title as metadata.
*
* To configure content mappers and manipulators, and to use the structural content extractor,
* define your collection reader as follows.
*
* <pre>
* collectionreader:
* class: FolderReader
* contentExtractor: StructureContentExtractor
* extractTextBlocks: true
* contentManipulators:
* - RemoveEmptyText
* contentMappers:
* - SemanticHtml
* - MetaTags
* folders:
* - ./input
* </pre>
*
* If you do not include contentManipulators then none will be used. If you omit the contentMappers
* then the default StructuralAnnotations and MetaTags mapper will be used.
*
* The default value of extractTextBlocks is true. This means that the TextBlocks annotation will be
* run immediately. If you do not wish to run this annotator then set the value to false. It runs
* by default since otherwise the structural annotations extracted here are ignored by the rest of
* the pipeline. Pipeline developers may wish to disable this so they can configure the TextBlocks
* annotator specifically.
*
* Note that structured extraction will only work (or be beneficial) on certain document types such
* as DOC, DOCX, PPT/X, XLS/X, PDF and HTML.
*
*/
public class StructureContentExtractor extends AbstractContentExtractor {

  /** Configuration key holding the names of the content mapper classes to use. */
  public static final String FIELD_CONTENT_MAPPERS = "contentMappers";

  /** Configuration key holding the names of the content manipulator classes to use. */
  public static final String FIELD_CONTENT_MANIPULATORS = "contentManipulators";

  /** Configuration key controlling whether the TextBlocks annotator is run after extraction. */
  public static final String FIELD_EXTRACT_TEXT_BLOCKS = "extractTextBlocks";

  /** The Constant LOGGER. */
  private static final Logger LOGGER = LoggerFactory.getLogger(StructureContentExtractor.class);

  /** Document text set when processing fails and the document has no text of its own. */
  public static final String CORRUPT_FILE_TEXT = "FILE CONTENTS CORRUPT - UNABLE TO PROCESS";

  /** Name of the HTML meta tag recording each content manipulator class in use. */
  private static final String METADATA_CONTENT_MANIPULATORS = "baleen:content-manipulators";

  /** Name of the HTML meta tag recording each content mapper class in use. */
  private static final String METADATA_CONTENT_MAPPERS = "baleen:content-mappers";

  /** Fully qualified class names of the active manipulators, computed during initialisation. */
  private List<String> contentManipulatorClasses;

  /** Fully qualified class names of the active mappers, computed during initialisation. */
  private List<String> contentMapperClasses;

  private List<ContentManipulator> manipulators = Collections.emptyList();

  private DocumentToJCasConverter documentConverter;

  private TikaFormatExtractor formatExtractor;

  /** The text block annotator, or null when text block extraction is disabled. */
  private TextBlocks textBlocks = null;

  private List<ContentMapper> mappers = Collections.emptyList();

  /**
   * Reads the configuration, then creates and initialises the content manipulators, the content
   * mappers, the document converter, the format extractor and (unless disabled) the text block
   * annotator.
   *
   * @param context the UIMA context, passed on to each created component
   * @param params the configuration parameters; see the {@code FIELD_*} constants for the keys
   *     read here
   * @throws ResourceInitializationException if a configured class name is rejected while creating
   *     the manipulators or mappers, or if the superclass or the text block annotator fails to
   *     initialise
   */
  @Override
  public void doInitialize(UimaContext context, Map<String, Object> params)
      throws ResourceInitializationException {
    super.doInitialize(context, params);

    // instanceof is false for null, so no separate null check is needed
    Object manipulatorConfig = params.get(FIELD_CONTENT_MANIPULATORS);
    if (manipulatorConfig instanceof String[]) {
      try {
        manipulators =
            createContentProcessor(
                ContentManipulator.class,
                BaleenDefaults.DEFAULT_CONTENT_MANIPULATOR_PACKAGE,
                (String[]) manipulatorConfig);
      } catch (InvalidParameterException e) {
        throw new ResourceInitializationException(e);
      }
      initialiseManipulators(context);
    }

    Object mapperConfig = params.get(FIELD_CONTENT_MAPPERS);
    if (mapperConfig instanceof String[]) {
      try {
        mappers =
            createContentProcessor(
                ContentMapper.class,
                BaleenDefaults.DEFAULT_CONTENT_MAPPER_PACKAGE,
                (String[]) mapperConfig);
      } catch (InvalidParameterException e) {
        throw new ResourceInitializationException(e);
      }
    } else {
      // Defaults to the structural annotations and meta tags mappers
      mappers = Lists.newArrayList(new StructuralAnnotations(), new MetaTags());
    }
    // Initialise configured and default mappers alike, since doDestroy destroys both
    initialiseMappers(context);

    contentManipulatorClasses =
        manipulators.stream().map(m -> m.getClass().getName()).collect(Collectors.toList());
    contentMapperClasses =
        mappers.stream().map(m -> m.getClass().getName()).collect(Collectors.toList());

    documentConverter = new DocumentToJCasConverter(mappers);
    formatExtractor = new TikaFormatExtractor();

    // Run the text block annotator after the configuration
    if (isTextBlocksEnabled(params.get(FIELD_EXTRACT_TEXT_BLOCKS))) {
      textBlocks = new TextBlocks();
      textBlocks.initialize(context);
    }
  }

  /**
   * Interprets the value of the {@link #FIELD_EXTRACT_TEXT_BLOCKS} setting.
   *
   * @param config the raw configuration value, may be null
   * @return the value itself for a Boolean; false for the strings "false" or "no" (ignoring case);
   *     true for anything else, including null
   */
  private static boolean isTextBlocksEnabled(Object config) {
    if (config instanceof Boolean) {
      return (Boolean) config;
    }
    if (config instanceof String) {
      String value = (String) config;
      return !("false".equalsIgnoreCase(value) || "no".equalsIgnoreCase(value));
    }
    return true;
  }

  /**
   * Initialises every content mapper. A mapper that fails to initialise is reported as a warning
   * and left in the list; the failure is not propagated.
   *
   * @param context the UIMA context passed to each mapper
   */
  private void initialiseMappers(UimaContext context) {
    for (ContentMapper mapper : mappers) {
      try {
        mapper.initialize(context);
      } catch (Exception e) {
        // Pass the class name so the placeholder is filled; the exception goes last as the cause
        getMonitor()
            .warn("Unable to initialise content mapper: {}", mapper.getClass().getName(), e);
      }
    }
  }

  /**
   * Initialises every content manipulator. A manipulator that fails to initialise is reported as
   * a warning and left in the list; the failure is not propagated.
   *
   * @param context the UIMA context passed to each manipulator
   */
  private void initialiseManipulators(UimaContext context) {
    for (ContentManipulator manipulator : manipulators) {
      try {
        manipulator.initialize(context);
      } catch (Exception e) {
        // Pass the class name so the placeholder is filled; the exception goes last as the cause
        getMonitor()
            .warn(
                "Unable to initialise content manipulator: {}",
                manipulator.getClass().getName(),
                e);
      }
    }
  }

  /**
   * Creates the content processors (ie mappers or manipulators) named in the configuration.
   *
   * Classes which can not be instantiated, or which are not of the requested type, are logged and
   * skipped, so the returned list may be shorter than the list of names.
   *
   * @param <T> the generic type
   * @param clazz the clazz (of T) each created instance must be an instance of
   * @param defaultPackage the default package to look in
   * @param classes the names of the classes to create
   * @return the created instances, in configuration order
   * @throws InvalidParameterException propagated from the class lookup in BuilderUtils
   */
  private <T> List<T> createContentProcessor(
      Class<T> clazz, String defaultPackage, String[] classes) throws InvalidParameterException {
    List<T> created = new ArrayList<>(classes.length);
    for (String className : classes) {
      try {
        Object instance =
            BuilderUtils.getClassFromString(className, defaultPackage)
                .getDeclaredConstructor()
                .newInstance();
        if (clazz.isInstance(instance)) {
          created.add(clazz.cast(instance));
        } else {
          LOGGER.warn("Unable to create, as {} is not of type {}", className, clazz.getName());
        }
      } catch (ReflectiveOperationException e) {
        // Covers a missing or inaccessible no-argument constructor and a failing constructor
        LOGGER.info("Could not find or instantiate {}", className, e);
      }
    }
    return created;
  }

  /**
   * Extracts the stream to HTML, records the active manipulators and mappers as meta tags, applies
   * the manipulators, converts the document into the JCas and finally runs the text block
   * annotator if enabled.
   *
   * Any exception raised along the way is logged as a warning and the document is marked as
   * corrupt; nothing is rethrown from here.
   *
   * @param stream the stream to read the document from
   * @param source the source of the document, used for extraction and in log messages
   * @param jCas the JCas to populate
   * @throws IOException declared by the overridden method
   */
  @Override
  public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    try {
      Extraction extraction = extract(stream, source);
      Document document = Jsoup.parse(extraction.getHtml());

      // Add information on content mappers and content manipulators to the metadata
      for (String manipulatorClass : contentManipulatorClasses) {
        appendMeta(document, METADATA_CONTENT_MANIPULATORS, manipulatorClass);
      }
      for (String mapperClass : contentMapperClasses) {
        appendMeta(document, METADATA_CONTENT_MAPPERS, mapperClass);
      }

      for (ContentManipulator manipulator : manipulators) {
        manipulator.manipulate(document);
      }

      documentConverter.apply(document, jCas);

      super.doProcessStream(stream, source, jCas);

      // Run the text block extraction (if configured)
      if (textBlocks != null) {
        textBlocks.process(jCas);
      }
    } catch (Exception e) {
      getMonitor().warn("Couldn't extract structure from document '{}'", source, e);
      setCorrupt(jCas);
    }
  }

  /**
   * Appends a meta element with the given name and content to the head of the document.
   *
   * @param document the document to add the element to
   * @param name the value of the name attribute
   * @param content the value of the content attribute
   */
  private static void appendMeta(Document document, String name, String content) {
    document.head().appendElement("meta").attr("name", name).attr("content", content);
  }

  /**
   * Perform actual extraction.
   *
   * This is a separate function to allow it to be overridden during testing (or by other
   * implementations).
   *
   * @param stream the stream
   * @param source the source
   * @return the extraction
   * @throws ExtractionException the extraction exception
   */
  protected Extraction extract(InputStream stream, String source) throws ExtractionException {
    return formatExtractor.parse(stream, source);
  }

  /**
   * Mark a document as corrupt by setting its text to {@link #CORRUPT_FILE_TEXT}. Documents which
   * already have text are left untouched.
   *
   * @param jCas the jCas
   */
  private void setCorrupt(JCas jCas) {
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }

  /** Destroys the text block annotator (if any), then every manipulator and mapper. */
  @Override
  public void doDestroy() {
    if (textBlocks != null) {
      textBlocks.destroy();
      textBlocks = null;
    }

    // Destroy all the content mapper and manipulators
    manipulators.forEach(ContentManipulator::destroy);
    mappers.forEach(ContentMapper::destroy);

    super.doDestroy();
  }
}