//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.contentextractors; import java.io.IOException; import java.io.InputStream; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.apache.uima.UimaContext; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import org.xml.sax.SAXException; import uk.gov.dstl.baleen.contentextractors.helpers.AbstractContentExtractor; /** * Extracts metadata and content from an InputStream, and sets the first tearline of the document as the content. * * * @baleen.javadoc */ public class TearlineContentExtractor extends AbstractContentExtractor { private Pattern tearlinePattern; /** * A list of boilerplate regular expressions that will be removed from the document (after tearlining, and case sensitively). * * @baleen.config */ public static final String PARAM_BOILERPLATE = "boilerplate"; @ConfigurationParameter(name = PARAM_BOILERPLATE, defaultValue = {}) List<String> boilerplate; /** * The regular expression that is used to identify tearlines in the document. If no tearlines are matched, then the whole document is returned. * * @baleen.config [\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*tear[\\h]*line[\\h]*[\\p{Pc}\\p{Pd}]+[\\h]* */ public static final String PARAM_TEARLINE = "tearline"; @ConfigurationParameter(name = PARAM_TEARLINE, defaultValue = "[\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*tear[\\h]*line[\\h]*[\\p{Pc}\\p{Pd}]+[\\h]*") String tearline; @Override public void doInitialize(UimaContext context, Map<String, Object> params) throws ResourceInitializationException { super.doInitialize(context, params); tearlinePattern = Pattern.compile(tearline, Pattern.CASE_INSENSITIVE); } @Override public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException { super.doProcessStream(stream, source, jCas); try { BodyContentHandler textHandler = new BodyContentHandler(Integer.MAX_VALUE); Metadata metadata = new Metadata(); ParseContext context = new ParseContext(); AutoDetectParser autoParser = new AutoDetectParser(); autoParser.parse(stream, textHandler, metadata, context); String fullContent = textHandler.toString(); Matcher m = tearlinePattern.matcher(fullContent); if(m.find()){ jCas.setDocumentText(removeBoilerplate(fullContent.substring(0, m.start())).trim()); }else{ jCas.setDocumentText(removeBoilerplate(fullContent).trim()); } for (String name : metadata.names()) { addMetadata(jCas, name, metadata.get(name)); } } catch (SAXException | TikaException e) { getMonitor().warn("Couldn't parse metadata from '{}'", source, e); } } private String removeBoilerplate(String content){ String ret = content; for(String s : boilerplate){ ret = ret.replaceAll(s, ""); } return ret; } }