package eu.europeana.cloud.service.dps.storm.topologies.indexer;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParseException;
import com.google.gson.JsonParser;
import eu.europeana.cloud.common.model.dps.TaskState;
import eu.europeana.cloud.service.dps.PluginParameterKeys;
import eu.europeana.cloud.service.dps.index.IndexFields;
import eu.europeana.cloud.service.dps.index.Indexer;
import eu.europeana.cloud.service.dps.index.IndexerFactory;
import eu.europeana.cloud.service.dps.index.SupportedIndexers;
import eu.europeana.cloud.service.dps.index.exception.IndexerException;
import eu.europeana.cloud.service.dps.index.structure.IndexerInformations;
import eu.europeana.cloud.service.dps.storm.AbstractDpsBolt;
import eu.europeana.cloud.service.dps.storm.StormTaskTuple;
import eu.europeana.cloud.service.dps.util.LRUCache;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Index document by selected {@link Indexer}.
 *
 * @author Pavel Kefurt &lt;Pavel.Kefurt@gmail.com&gt;
 */
public class IndexBolt extends AbstractDpsBolt {

    private static final Logger LOGGER = LoggerFactory.getLogger(IndexBolt.class);

    /** Connection strings per supported indexer type (e.g. Elasticsearch/Solr addresses). */
    private final Map<SupportedIndexers, String> clustersAddresses;
    /** Maximum number of cached indexer connections. */
    private final int cacheSize;

    /** Lazily-built cache of open indexer clients, keyed by indexer configuration. */
    private transient LRUCache<String, Indexer> clients;

    /**
     * Constructor of Index bolt.
     *
     * @param clustersAddresses map of indexers and their connection strings
     * @param cacheSize number of established connections kept in the cache
     */
    public IndexBolt(Map<SupportedIndexers, String> clustersAddresses, int cacheSize) {
        this.clustersAddresses = clustersAddresses;
        this.cacheSize = cacheSize;
    }

    /**
     * Indexes the data carried by the tuple.
     * <p>
     * The document body is assembled as JSON from the tuple's file data (stored as raw
     * text when it is not valid JSON) plus optional extracted file metadata. The record
     * id is chosen from, in order of preference: the original file URL, the current file
     * URL, or generated by the index (insert only). Failures drop or error the task and
     * ack the input tuple; success emits the tuple downstream.
     *
     * @param t tuple describing the task, file and parameters to index
     */
    @Override
    public void execute(StormTaskTuple t) {
        Indexer indexer = getIndexer(t.getParameter(PluginParameterKeys.INDEXER));

        if (indexer == null) {
            LOGGER.warn("No indexer. Task {} is dropped.", t.getTaskId());
            emitDropNotification(t.getTaskId(), t.getFileUrl(), "No indexer.",
                    t.getParameters().toString());
            endTask(t.getTaskId(), "No indexer. Task " + t.getTaskId() + " is dropped.",
                    TaskState.DROPPED, new Date());
            outputCollector.ack(inputTuple);
            return;
        }

        String fileWithDataForIndex = t.getFileUrl();
        // Guard against null payload (new String(null) would throw NPE) and decode
        // explicitly as UTF-8 instead of relying on the platform default charset.
        byte[] fileData = t.getFileData();
        String rawData = fileData != null ? new String(fileData, StandardCharsets.UTF_8) : null;
        String originalFile = t.getParameter(PluginParameterKeys.ORIGINAL_FILE_URL);
        String fileMetadata = t.getParameter(PluginParameterKeys.FILE_METADATA); //extracted metadata

        //prepare data
        JsonObject data = new JsonObject();
        if (rawData != null && !rawData.isEmpty()) {
            try {
                data = new JsonParser().parse(rawData).getAsJsonObject();
            } catch (JsonParseException ex) //is not valid JSON => store as string
            {
                data.addProperty(IndexFields.RAW_TEXT.toString(), rawData);
            }
        }
        if (fileMetadata != null && !fileMetadata.isEmpty()) {
            try {
                JsonElement meta = new JsonParser().parse(fileMetadata);
                data.add(IndexFields.FILE_METADATA.toString(), meta);
            } catch (JsonParseException ex) //is not valid JSON
            {
                data.addProperty(IndexFields.FILE_METADATA.toString(), fileMetadata);
            }
        }

        try {
            //determine what I am indexing
            if (originalFile != null && !originalFile.isEmpty()) {
                //I am indexing extracted data from other file (e.g. features from binary file)
                //If this record already exists, then update fields only
                indexer.update(originalFile, data.toString());
            } else if (fileWithDataForIndex != null && !fileWithDataForIndex.isEmpty()) {
                //I am indexing data from current file (e.g. txt file)
                //If this record already exists, then update fields only
                indexer.update(fileWithDataForIndex, data.toString());
            } else {
                //I am indexing something other (_id for record will be generated)
                //only create new document - update is not possible
                indexer.insert(data.toString());
            }
        } catch (IndexerException ex) {
            LOGGER.warn("Cannot index data from task {} because: {}", t.getTaskId(), ex.getMessage());
            StringWriter stack = new StringWriter();
            ex.printStackTrace(new PrintWriter(stack));
            emitErrorNotification(t.getTaskId(), t.getFileUrl(),
                    "Cannot index data because: " + ex.getMessage(), stack.toString());
            endTask(t.getTaskId(), ex.getMessage(), TaskState.DROPPED, new Date());
            outputCollector.ack(inputTuple);
            return;
        }

        LOGGER.info("Data from task {} is indexed.", t.getTaskId());
        updateTask(t.getTaskId(), "", TaskState.CURRENTLY_PROCESSING, new Date());
        outputCollector.emit(inputTuple, t.toStormTuple());
        outputCollector.ack(inputTuple);
    }

    /** Initializes the per-worker connection cache (called by Storm before execution). */
    @Override
    public void prepare() {
        clients = new LRUCache<>(cacheSize);
    }

    /**
     * Returns an indexer client for the configuration encoded in the task parameter,
     * reusing a cached connection when one exists.
     *
     * @param data serialized indexer information from the task parameters
     * @return indexer client, or {@code null} if the parameter cannot be parsed
     */
    private Indexer getIndexer(String data) {
        IndexerInformations ii = IndexerInformations.fromTaskString(data);
        if (ii == null) {
            return null;
        }

        String key = ii.toKey();
        if (clients.containsKey(key)) {
            return clients.get(key);
        }

        //key does not exist => open new connection and add it to cache
        //NOTE(review): entries evicted from the LRU cache are not visibly closed here;
        //if Indexer holds an open connection this may leak - verify LRUCache/Indexer behavior.
        ii.setAddresses(clustersAddresses.get(ii.getIndexerName()));
        Indexer client = IndexerFactory.getIndexer(ii);
        clients.put(key, client);

        return client;
    }
}