// TearlineContentExtractor.java example
//
// Explorer
// baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.xml.sax.SAXException;

import uk.gov.dstl.baleen.contentextractors.helpers.AbstractContentExtractor;

/**
 * Extracts metadata and content from an InputStream, and uses the text that precedes the first
 * tearline of the document as the content.
 * 
 * <p>The stream is parsed with an {@link AutoDetectParser}. The extracted text is cut at the start
 * of the first match of the tearline pattern (the whole text is kept when there is no match), every
 * configured boilerplate expression is then removed, and the result is trimmed before being set as
 * the document text. All metadata names reported by the parser are passed to {@code addMetadata}.
 * 
 * @baleen.javadoc
 */
public class TearlineContentExtractor extends AbstractContentExtractor {
	/** Case-insensitive compiled form of {@link #tearline}; built in {@link #doInitialize}. */
	private Pattern tearlinePattern;

	/**
	 * Compiled form of {@link #boilerplate}, in the same order; built once in {@link #doInitialize}
	 * so that the expressions are not recompiled for every document.
	 */
	private Pattern[] boilerplatePatterns = new Pattern[0];
	
	/**
	 * A list of boilerplate regular expressions that will be removed from the document (after tearlining, and case sensitively).
	 * 
	 * @baleen.config
	 */
	public static final String PARAM_BOILERPLATE = "boilerplate";
	@ConfigurationParameter(name = PARAM_BOILERPLATE, defaultValue = {})
	List<String> boilerplate;
	
	/**
	 * The regular expression that is used to identify tearlines in the document. If no tearlines are matched, then the whole document is returned.
	 * 
	 * @baleen.config [\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*tear[\\h]*line[\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*
	 */
	public static final String PARAM_TEARLINE = "tearline";
	@ConfigurationParameter(name = PARAM_TEARLINE, defaultValue = "[\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*tear[\\h]*line[\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*")
	String tearline;
	
	/**
	 * Compiles the tearline expression (case insensitively) and the boilerplate expressions
	 * (case sensitively) after the superclass has been initialised.
	 * 
	 * @param context the UIMA context, passed through to the superclass
	 * @param params the configuration parameters, passed through to the superclass
	 * @throws ResourceInitializationException if the superclass fails to initialise, or if the
	 *         tearline or any boilerplate expression is not a valid regular expression
	 */
	@Override
	public void doInitialize(UimaContext context, Map<String, Object> params) throws ResourceInitializationException {
		super.doInitialize(context, params);
		
		try {
			tearlinePattern = Pattern.compile(tearline, Pattern.CASE_INSENSITIVE);
			boilerplatePatterns = compileBoilerplate();
		} catch (IllegalArgumentException e) {
			// PatternSyntaxException is an IllegalArgumentException; report a bad expression as a
			// configuration failure at start-up rather than as an unchecked exception later on.
			throw new ResourceInitializationException(e);
		}
	}
	
	/**
	 * Parses the stream, sets the document text of the JCas to the (boilerplate-free, trimmed) text
	 * before the first tearline, and adds every metadata entry found by the parser.
	 * 
	 * <p>If parsing fails with a {@link SAXException} or {@link TikaException} a warning is logged
	 * and neither the document text nor the metadata are set by this method.
	 * 
	 * @param stream the content to parse
	 * @param source the name of the source, used in the warning message
	 * @param jCas the JCas to populate
	 * @throws IOException if thrown by the superclass or while parsing the stream
	 */
	@Override
	public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
		super.doProcessStream(stream, source, jCas);

		try {
			// NOTE(review): the int argument is understood to be Tika's write limit; confirm
			// against the BodyContentHandler documentation.
			BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE);
			Metadata metadata = new Metadata();
			ParseContext context = new ParseContext();

			AutoDetectParser autoParser = new AutoDetectParser();
			autoParser.parse(stream, textHandler, metadata, context);

			String fullContent = textHandler.toString();
			Matcher m = tearlinePattern.matcher(fullContent);
			// Keep only what comes before the first tearline; with no tearline keep everything.
			String retained = m.find() ? fullContent.substring(0, m.start()) : fullContent;
			jCas.setDocumentText(removeBoilerplate(retained).trim());

			for (String name : metadata.names()) {
				addMetadata(jCas, name, metadata.get(name));
			}
		} catch (SAXException | TikaException e) {
			getMonitor().warn("Couldn't parse metadata from '{}'", source, e);
		}
	}
	
	/**
	 * Compiles each configured boilerplate expression, preserving the configured order.
	 * 
	 * @return the compiled expressions; an empty array if no boilerplate list is configured
	 */
	private Pattern[] compileBoilerplate(){
		if(boilerplate == null){
			return new Pattern[0];
		}
		
		Pattern[] compiled = new Pattern[boilerplate.size()];
		int i = 0;
		for(String regex : boilerplate){
			compiled[i++] = Pattern.compile(regex);
		}
		
		return compiled;
	}
	
	/**
	 * Removes every match of every boilerplate expression from the content, applying the
	 * expressions one after another in their configured order.
	 * 
	 * @param content the text to clean
	 * @return the text with all boilerplate matches replaced by the empty string
	 */
	private String removeBoilerplate(String content){
		String ret = content;
		
		for(Pattern p : boilerplatePatterns){
			// Same result as String.replaceAll(regex, ""), minus the per-call compilation.
			ret = p.matcher(ret).replaceAll("");
		}
		
		return ret;
	}
}