// Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.contentextractors.helpers;
import java.io.IOException;
import java.io.InputStream;
import java.util.Map;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.component.initialize.ConfigurationParameterInitializer;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.DocumentAnnotation;
import org.apache.uima.resource.ResourceInitializationException;
import uk.gov.dstl.baleen.uima.BaleenContentExtractor;
/**
 * Base class for content extractors that handles the bookkeeping common to all of them.
 *
 * <p>When a stream is processed, the document annotation is stamped with the source URI and the
 * current time, and a metadata entry naming the concrete extractor class is added.
 */
public abstract class AbstractContentExtractor extends BaleenContentExtractor {

  /** Metadata key under which the name of the extractor class is recorded. */
  private static final String METADATA_KEY_CONTENT_EXTRACTOR = "baleen:content-extractor";

  /**
   * Applies the supplied configuration parameters to this instance by delegating to
   * {@link ConfigurationParameterInitializer#initialize(Object, Map)}.
   *
   * @param context the UIMA context; not used by this implementation
   * @param params the configuration parameters handed to the initializer
   * @throws ResourceInitializationException if the initializer fails
   */
  @Override
  public void doInitialize(UimaContext context, Map<String, Object> params)
      throws ResourceInitializationException {
    ConfigurationParameterInitializer.initialize(this, params);
  }

  /**
   * Records the source, the processing time and the extractor class on the document.
   *
   * <p>The stream itself is not read here.
   * NOTE(review): presumably subclasses override this, call {@code super} and then consume the
   * stream - confirm against the concrete extractors.
   *
   * @param stream the content being processed; not read by this implementation
   * @param source the source identifier, stored as the document's source URI
   * @param jCas the JCas whose document annotation and metadata are updated
   * @throws IOException declared by the signature; nothing in this method throws it explicitly
   */
  @Override
  public void doProcessStream(InputStream stream, String source, JCas jCas) throws IOException {
    final DocumentAnnotation documentAnnotation = getSupport().getDocumentAnnotation(jCas);
    documentAnnotation.setSourceUri(source);

    final long processedAt = System.currentTimeMillis();
    documentAnnotation.setTimestamp(processedAt);

    // Capture which content extractor produced this document, using the runtime class name.
    final String extractorClassName = getClass().getName();
    addMetadata(jCas, METADATA_KEY_CONTENT_EXTRACTOR, extractorClassName);
  }

  /** Intentionally empty: this class does nothing on destroy. */
  @Override
  public void doDestroy() {
    // Nothing to clean up.
  }
}