package eu.europeana.cloud.service.dps.storm.topologies.text; import backtype.storm.topology.OutputFieldsDeclarer; import com.google.gson.Gson; import eu.europeana.cloud.service.dps.PluginParameterKeys; import eu.europeana.cloud.service.dps.storm.AbstractDpsBolt; import eu.europeana.cloud.service.dps.storm.NotificationTuple; import eu.europeana.cloud.service.dps.storm.StormTaskTuple; import eu.europeana.cloud.service.dps.storm.transform.text.TextExtractor; import eu.europeana.cloud.service.dps.storm.transform.text.TextExtractorFactory; import java.io.ByteArrayInputStream; import java.nio.charset.Charset; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Bolt for text extracting. * It uses {@link DpsTask} parameter with key {@link PluginParameterKeys.EXTRACTOR} for determine which method should be used. * * @author Pavel Kefurt <Pavel.Kefurt@gmail.com> */ public class ExtractTextBolt extends AbstractDpsBolt { private final String storeStremName; private final String defaultStreamName; private static final Logger LOGGER = LoggerFactory.getLogger(ExtractTextBolt.class); /** * Constructor of extract text bolt with default stream. */ public ExtractTextBolt() { this(null, null); } /** * Constructor of extract text bolt with one stream. * * @param defaultStreamName stream name - null => use default stream */ public ExtractTextBolt(String defaultStreamName) { this(defaultStreamName, null); } /** * Constructor of extract text bolt with two streams. * If defaultStreamName and storeStremName are null then it use default stream. * * @param defaultStreamName name of stream for emit task if {@link PluginParameterKeys#STORE_EXTRACTED_TEXT} is false * @param storeStremName name of stream for emit task if {@link PluginParameterKeys#STORE_EXTRACTED_TEXT} is true */ public ExtractTextBolt(String defaultStreamName, String storeStremName) { if (storeStremName == null) { storeStremName = defaultStreamName; } else if (defaultStreamName == null) { defaultStreamName = storeStremName; } this.storeStremName = storeStremName; this.defaultStreamName = defaultStreamName; } @Override public void execute(StormTaskTuple t) { String representationName = t.getParameter(PluginParameterKeys.REPRESENTATION_NAME); String fileFormats = t.getParameter(PluginParameterKeys.FILE_FORMATS); String extractorName; if (fileFormats != null && !fileFormats.isEmpty()) { Map<String, String> formats = new Gson().fromJson(fileFormats, Map.class); String format = formats.get(representationName); if (format != null && !format.isEmpty()) { extractorName = format; } else { extractorName = representationName; } } else { extractorName = representationName; } String extractors = t.getParameter(PluginParameterKeys.EXTRACTORS); String extractionMetodName = null; if (extractors != null && !extractors.isEmpty()) { Map<String, String> extractors_ = new Gson().fromJson(extractors, Map.class); extractionMetodName = extractors_.get(extractorName); } TextExtractor extractor = TextExtractorFactory.getExtractor(extractorName, extractionMetodName); if (extractor == null) { String message = String.format("Extractor does not exist for extractor name %s.", extractorName); LOGGER.warn(message); emitDropNotification(t.getTaskId(), t.getFileUrl(), message, t.getParameters().toString()); return; } else { LOGGER.info("Extractor name: {}, Required extraction method: {}, Selected extraction method: {}", extractorName, extractionMetodName, extractor.getExtractionMethod().name()); } ByteArrayInputStream data = t.getFileByteDataAsStream(); String extractedText = extractor.extractText(data); Map<String, String> metadata = extractor.getExtractedMetadata(); if (extractedText != null && !extractedText.isEmpty()) { t.setFileData(extractedText.getBytes(Charset.forName("UTF-8"))); t.addParameter(PluginParameterKeys.MIME_TYPE, "text/plain"); t.addParameter(PluginParameterKeys.REPRESENTATION_NAME, extractor.getRepresentationName()); t.addParameter(PluginParameterKeys.ORIGINAL_FILE_URL, t.getFileUrl()); if (metadata != null && !metadata.isEmpty()) { t.addParameter(PluginParameterKeys.FILE_METADATA, new Gson().toJson(metadata)); } if (storeStremName != null && defaultStreamName != null) { //store extracted text? if (Boolean.parseBoolean(t.getParameter(PluginParameterKeys.STORE_EXTRACTED_TEXT))) { outputCollector.emit(storeStremName, inputTuple, t.toStormTuple()); } else { outputCollector.emit(defaultStreamName, inputTuple, t.toStormTuple()); } } else { outputCollector.emit(inputTuple, t.toStormTuple()); } } else { String message = String.format("Cannot extract data from %s by %s. (or file is empty)", extractorName, extractor.getExtractionMethod().name()); emitDropNotification(t.getTaskId(), t.getFileUrl(), message, t.getParameters().toString()); } } @Override public void prepare() { } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { if (storeStremName != null && defaultStreamName != null) { if (!storeStremName.equals(defaultStreamName)) { //store branch declarer.declareStream(storeStremName, StormTaskTuple.getFields()); } //default branch declarer.declareStream(defaultStreamName, StormTaskTuple.getFields()); } else { declarer.declare(StormTaskTuple.getFields()); } //notifications declarer.declareStream(NOTIFICATION_STREAM_NAME, NotificationTuple.getFields()); } }