// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.uima.UimaContext;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Strings;
import com.google.common.collect.Lists;

import io.committed.krill.extraction.Extraction;
import io.committed.krill.extraction.exception.ExtractionException;
import io.committed.krill.extraction.tika.TikaFormatExtractor;

import uk.gov.dstl.baleen.common.structure.TextBlocks;
import uk.gov.dstl.baleen.contentextractors.helpers.AbstractContentExtractor;
import uk.gov.dstl.baleen.contentextractors.helpers.DocumentToJCasConverter;
import uk.gov.dstl.baleen.contentmanipulators.helpers.ContentManipulator;
import uk.gov.dstl.baleen.contentmappers.MetaTags;
import uk.gov.dstl.baleen.contentmappers.StructuralAnnotations;
import uk.gov.dstl.baleen.contentmappers.helpers.ContentMapper;
import uk.gov.dstl.baleen.core.utils.BaleenDefaults;
import uk.gov.dstl.baleen.core.utils.BuilderUtils;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.types.structure.Structure;

/**
 * Extracts metadata, structural annotations and text content from the supplied input.
 *
 * Structural annotations are as defined under the Baleen type system with the top-level
 * {@link Structure} class.
 *
 * Structural extraction allows downstream annotators to understand the document better, because
 * they can use the information to segment the document rather than treating it as a whole. For
 * example, applying a regex to each cell of a table is quite different from applying it to the
 * flat text version of the entire table.
 *
 * The process of structural content extraction is as follows:
 *
 * <ul>
 * <li>The document is parsed and converted to a rich HTML representation. This is a general 'per
 * document format' conversion.
 * <li>A set of content manipulators, configured for this Baleen pipeline, act on the HTML. These
 * can do anything (add new nodes, remove or amend text, etc). They might be used to clean up the
 * HTML or to remove elements which aren't required by the pipeline.
 * <li>A set of content mappers convert the HTML nodes into annotations. They may create
 * structural elements, or other types such as metadata or entities. The set of content mappers is
 * configurable per pipeline.
 * <li>The text of the document is extracted. Note that content mappers cannot change the text
 * output; if you wish to change the text output then use a content manipulator.
 * </ul>
 *
 * Note that content mappers and content manipulators can work in isolation or in coordination. By
 * coordination we mean that a content manipulator might find the most likely title in a document
 * and mark it by introducing a new HTML span element with a class of "title". A special content
 * mapper could then look for this span and add the title as metadata (a minimal sketch of this
 * pattern appears at the bottom of this file).
 *
 * To configure content mappers and manipulators, and to use the structural content extractor,
 * define your collection reader as follows.
 *
 * <pre>
 * collectionreader:
 *   class: FolderReader
 *   contentExtractor: StructureContentExtractor
 *   extractTextBlocks: true
 *   contentManipulators:
 *   - RemoveEmptyText
 *   contentMappers:
 *   - SemanticHtml
 *   - MetaTags
 *   folders:
 *   - ./input
 * </pre>
 *
 * If you do not include contentManipulators then none will be used. If you omit contentMappers
 * then the default StructuralAnnotations and MetaTags mappers will be used.
 *
 * The default value of extractTextBlocks is true, meaning the TextBlocks annotator will be run
 * immediately after extraction. If you do not wish to run this annotator then set the value to
 * false. It runs by default because otherwise the structural annotations extracted here would be
 * ignored by the rest of the pipeline. Pipeline developers may wish to disable it so they can
 * configure the TextBlocks annotator explicitly.
 *
 * Note that structured extraction will only work (or be beneficial) on certain document types such
 * as DOC, DOCX, PPT/X, XLS/X, PDF and HTML.
 *
 */
public class StructureContentExtractor extends AbstractContentExtractor {

  public static final String FIELD_CONTENT_MAPPERS = "contentMappers";
  public static final String FIELD_CONTENT_MANIPULATORS = "contentManipulators";
  public static final String FIELD_EXTRACT_TEXT_BLOCKS = "extractTextBlocks";

  /** The Constant LOGGER. */
  private static final Logger LOGGER = LoggerFactory.getLogger(StructureContentExtractor.class);

  public static final String CORRUPT_FILE_TEXT = "FILE CONTENTS CORRUPT - UNABLE TO PROCESS";

  private static final String METADATA_CONTENT_MANIPULATORS = "baleen:content-manipulators";
  private static final String METADATA_CONTENT_MAPPERS = "baleen:content-mappers";

  private List<String> contentManipulatorClasses;
  private List<String> contentMapperClasses;

  private List<ContentManipulator> manipulators = Collections.emptyList();

  private DocumentToJCasConverter documentConverter;

  private TikaFormatExtractor formatExtractor;

  private TextBlocks textBlocks = null;

  private List<ContentMapper> mappers = Collections.emptyList();

  @Override
  public void doInitialize(UimaContext context, Map<String, Object> params)
      throws ResourceInitializationException {
    super.doInitialize(context, params);

    Object manipulatorConfig = params.get(FIELD_CONTENT_MANIPULATORS);
    if (manipulatorConfig instanceof String[]) {
      try {
        manipulators = createContentProcessor(ContentManipulator.class,
            BaleenDefaults.DEFAULT_CONTENT_MANIPULATOR_PACKAGE, (String[]) manipulatorConfig);

        // Initialise the manipulators
        initialiseManipulators(context);
      } catch (InvalidParameterException e) {
        throw new ResourceInitializationException(e);
      }
    }

    Object mapperConfig = params.get(FIELD_CONTENT_MAPPERS);
    if (mapperConfig instanceof String[]) {
      try {
        mappers = createContentProcessor(ContentMapper.class,
            BaleenDefaults.DEFAULT_CONTENT_MAPPER_PACKAGE, (String[]) mapperConfig);

        // Initialise the mappers
        initialiseMappers(context);
      } catch (InvalidParameterException e) {
        throw new ResourceInitializationException(e);
      }
    } else {
      // Default to extraction of structural annotations and metadata tags
      mappers = Lists.newArrayList(new StructuralAnnotations(), new MetaTags());
    }

    contentManipulatorClasses =
        manipulators.stream().map(m -> m.getClass().getName()).collect(Collectors.toList());
    contentMapperClasses =
        mappers.stream().map(m -> m.getClass().getName()).collect(Collectors.toList());

    documentConverter = new DocumentToJCasConverter(mappers);

    formatExtractor = new TikaFormatExtractor();

    // Determine from the configuration whether to run the text block annotator
    Object extractTextBlockConfig = params.get(FIELD_EXTRACT_TEXT_BLOCKS);
    boolean runTextBlocks = true;
    if (extractTextBlockConfig instanceof String) {
      String value = (String) extractTextBlockConfig;
      if ("false".equalsIgnoreCase(value) || "no".equalsIgnoreCase(value)) {
        runTextBlocks = false;
      }
    } else if (extractTextBlockConfig instanceof Boolean) {
      runTextBlocks = (Boolean) extractTextBlockConfig;
    }

    if (runTextBlocks) {
      textBlocks = new TextBlocks();
      textBlocks.initialize(context);
    }
  }

  private void initialiseMappers(UimaContext context) {
    mappers.forEach(m -> {
      try {
        m.initialize(context);
      } catch (Exception e) {
        getMonitor().warn("Unable to initialise content mapper: {}", e);
      }
    });
  }

  private void initialiseManipulators(UimaContext context) {
    manipulators.forEach(m -> {
      try {
        m.initialize(context);
      } catch (Exception e) {
        getMonitor().warn("Unable to initialise content manipulator: {}", e);
      }
    });
  }

  /**
   * Creates a content processor (ie a mapper or a manipulator).
   *
   * @param <T> the generic type
   * @param clazz the class (of T)
   * @param defaultPackage the default package to look in
   * @param classes the classes to instantiate
   * @return the list of instantiated processors
   * @throws InvalidParameterException the invalid parameter exception
   */
  // Note this is checked by clazz isInstance
  @SuppressWarnings("unchecked")
  private <T> List<T> createContentProcessor(Class<T> clazz, String defaultPackage,
      String[] classes) throws InvalidParameterException {
    List<T> list = new ArrayList<>();
    for (String c : classes) {
      try {
        Object instance = BuilderUtils.getClassFromString(c, defaultPackage).newInstance();
        if (clazz.isInstance(instance)) {
          list.add((T) instance);
        } else {
          LOGGER.warn(
              String.format("Unable to create, as %s is not of type %s", c, clazz.getName()));
        }
      } catch (InstantiationException | IllegalAccessException e) {
        LOGGER.info("Could not find or instantiate " + c, e);
      }
    }
    return list;
  }

  @Override
  public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    try {
      Extraction extraction = extract(stream, source);

      Document document = Jsoup.parse(extraction.getHtml());

      // Add information on content mappers and content manipulators to the metadata
      contentManipulatorClasses.forEach(c -> document.head().appendElement("meta")
          .attr("name", METADATA_CONTENT_MANIPULATORS).attr("content", c));
      contentMapperClasses.forEach(c -> document.head().appendElement("meta")
          .attr("name", METADATA_CONTENT_MAPPERS).attr("content", c));

      for (ContentManipulator manipulator : manipulators) {
        manipulator.manipulate(document);
      }

      documentConverter.apply(document, jCas);

      super.doProcessStream(stream, source, jCas);

      // Run the text block extraction (if configured)
      if (textBlocks != null) {
        textBlocks.process(jCas);
      }

    } catch (Exception e) {
      getMonitor().warn("Couldn't extract structure from document '{}'", source, e);
      setCorrupt(jCas);
    }
  }

  /**
   * Perform the actual extraction.
   *
   * This is a separate method to allow it to be overridden during testing (or by other
   * implementations).
   *
   * @param stream the stream
   * @param source the source
   * @return the extraction
   * @throws ExtractionException the extraction exception
   */
  protected Extraction extract(InputStream stream, String source) throws ExtractionException {
    return formatExtractor.parse(stream, source);
  }

  /**
   * Mark a document as corrupt.
   *
   * @param jCas the jCas
   */
  private void setCorrupt(JCas jCas) {
    if (Strings.isNullOrEmpty(jCas.getDocumentText())) {
      jCas.setDocumentText(CORRUPT_FILE_TEXT);
    }
  }

  @Override
  public void doDestroy() {
    if (textBlocks != null) {
      textBlocks.destroy();
      textBlocks = null;
    }

    // Destroy all the content mappers and manipulators
    manipulators.forEach(ContentManipulator::destroy);
    mappers.forEach(ContentMapper::destroy);

    super.doDestroy();
  }
}
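
/**
 * Illustrative only: a minimal, hypothetical sketch of the manipulator/mapper coordination
 * described in the {@link StructureContentExtractor} Javadoc, in which a content manipulator marks
 * the most likely title with {@code <span class="title">} so that a content mapper can later
 * surface it as metadata. This class is not part of the Baleen API; a real implementation would
 * perform this work inside a {@link ContentManipulator#manipulate} call, and the "first bold
 * element is the title" heuristic is purely an assumption made for the example.
 */
class TitleMarkingSketch {

  private TitleMarkingSketch() {
    // Utility class - not instantiable
  }

  /** Wraps the assumed title element in a marker span that a content mapper could match on. */
  static void markLikelyTitle(Document document) {
    // Assumption for illustration: treat the first <b> element in the body as the document title
    org.jsoup.nodes.Element candidate = document.body().select("b").first();
    if (candidate != null) {
      candidate.wrap("<span class=\"title\"></span>");
    }
  }
}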