package eu.europeana.cloud.service.dps.storm.topologies.indexer;

import eu.europeana.cloud.common.model.dps.TaskState;
import eu.europeana.cloud.service.dps.PluginParameterKeys;
import eu.europeana.cloud.service.dps.index.Indexer;
import eu.europeana.cloud.service.dps.index.IndexerFactory;
import eu.europeana.cloud.service.dps.index.SupportedIndexers;
import eu.europeana.cloud.service.dps.index.exception.IndexerException;
import eu.europeana.cloud.service.dps.index.structure.IndexedDocument;
import eu.europeana.cloud.service.dps.index.structure.IndexerInformations;
import eu.europeana.cloud.service.dps.storm.AbstractDpsBolt;
import eu.europeana.cloud.service.dps.storm.StormTaskTuple;
import eu.europeana.cloud.service.dps.util.LRUCache;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang.NotImplementedException;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Merges multiple indexed documents into one new document.
 * Values that share a key across documents are combined into a single list.
 *
 * @author Pavel Kefurt <Pavel.Kefurt@gmail.com>
 */
public class MergeIndexedDocumentsBolt extends AbstractDpsBolt {

    private static final Logger LOGGER = LoggerFactory.getLogger(MergeIndexedDocumentsBolt.class);

    private final Map<SupportedIndexers, String> clustersAddresses;
    private final int cacheSize;

    private transient LRUCache<String, Indexer> clients;

    /**
     * Constructor of MergeIndexedDocumentsBolt.
     *
     * @param clustersAddresses map of indexers and their connection strings
     * @param cacheSize maximum number of established connections kept in the cache
     */
    public MergeIndexedDocumentsBolt(Map<SupportedIndexers, String> clustersAddresses, int cacheSize) {
        this.clustersAddresses = clustersAddresses;
        this.cacheSize = cacheSize;
    }

    @Override
    public void execute(StormTaskTuple t) {
        Indexer indexer = getIndexer(t.getParameter(PluginParameterKeys.INDEXER));
        if (indexer == null) {
            LOGGER.warn("No indexer. Task {} is dropped.", t.getTaskId());
            emitDropNotification(t.getTaskId(), t.getFileUrl(), "No indexer.", t.getParameters().toString());
            endTask(t.getTaskId(), "No indexer. Task " + t.getTaskId() + " is dropped.", TaskState.DROPPED, new Date());
            outputCollector.ack(inputTuple);
            return;
        }

        List<String> docIds = getDocumentIdsFromAnnotation(t.getFileUrl());

        Map<String, Object> mergedData = new HashMap<>();
        for (String docId : docIds) {
            try {
                //retrieve data from the index
                IndexedDocument document = indexer.getDocument(docId);
                Map<String, Object> data = document.getData();

                //merge the document's data field by field
                for (Map.Entry<String, Object> d : data.entrySet()) {
                    String key = d.getKey();
                    if (mergedData.containsKey(key)) {  //key already exists => conflict
                        Object val = d.getValue();
                        Object mData = mergedData.get(key);
                        //copy into fresh lists instead of mutating in place: collections
                        //coming from retrieved documents may be unmodifiable
                        if (val instanceof Collection<?>) {  //new value is a list
                            if (mData instanceof Collection<?>) {  //merged data is a list as well
                                Collection<Object> tmp = new ArrayList<>((Collection<?>) mData);
                                tmp.addAll((Collection<?>) val);
                                mergedData.put(key, tmp);
                            } else {
                                Collection<Object> tmp = new ArrayList<>((Collection<?>) val);
                                tmp.add(mData);
                                mergedData.put(key, tmp);
                            }
                        } else if (mData instanceof Collection<?>) {  //merged data already contains a list
                            Collection<Object> tmp = new ArrayList<>((Collection<?>) mData);
                            tmp.add(val);
                            mergedData.put(key, tmp);
                        } else {  //wrap both scalar values in a new list
                            List<Object> l = new ArrayList<>();
                            l.add(mData);
                            l.add(val);
                            mergedData.put(key, l);
                        }
                    } else {
                        mergedData.put(d.getKey(), d.getValue());
                    }
                }
            } catch (IndexerException ex) {
                LOGGER.warn("Cannot read indexed document because: {}", ex.getMessage());
            }
        }

        try {
            t.setFileData(new ObjectMapper().writeValueAsBytes(mergedData));
        } catch (IOException ex) {
            LOGGER.warn("Cannot serialize merged data because: {}", ex.getMessage());
            StringWriter stack = new StringWriter();
            ex.printStackTrace(new PrintWriter(stack));
            emitDropNotification(t.getTaskId(), t.getFileUrl(), "Cannot serialize merged data.", stack.toString());
            endTask(t.getTaskId(), ex.getMessage(), TaskState.DROPPED, new Date());
            outputCollector.ack(inputTuple);
            return;
        }

        LOGGER.info("Merged documents: {}", docIds);

        outputCollector.emit(inputTuple, t.toStormTuple());
        outputCollector.ack(inputTuple);
    }

    @Override
    public void prepare() {
        clients = new LRUCache<>(cacheSize);
    }

    private Indexer getIndexer(String data) {
        IndexerInformations ii = IndexerInformations.fromTaskString(data);
        if (ii == null) {
            return null;
        }

        String key = ii.toKey();
        if (clients.containsKey(key)) {
            return clients.get(key);
        }

        //key does not exist => open a new connection and add it to the cache
        ii.setAddresses(clustersAddresses.get(ii.getIndexerName()));
        Indexer client = IndexerFactory.getIndexer(ii);
        clients.put(key, client);
        return client;
    }

    private List<String> getDocumentIdsFromAnnotation(String annotationUrl) {
        //TODO: retrieve IDs from annotation
        throw new NotImplementedException("Waiting for Annotation service!");
    }
}