/*
 * JBoss, Home of Professional Open Source
 * Copyright 2012 Red Hat Inc. and/or its affiliates and other contributors
 * as indicated by the @authors tag. All rights reserved.
 */
package org.jboss.elasticsearch.river.remote;

import java.util.Date;
import java.util.Map;

import org.elasticsearch.action.bulk.BulkRequestBuilder;

/**
 * Class used to run one index update process for one Space. The incremental indexing process is based on the date of
 * the last document update.
 * <p>
 * Searches data in the remote system by the timestamp of the last update. Documents returned from the remote system
 * client MUST be ordered ascending by last-update timestamp!
 * <p>
 * Can be used for one run only, then it must be discarded and a new instance created!
 *
 * @author Vlastimil Elias (velias at redhat dot com)
 */
public class SpaceByLastUpdateTimestampIndexer extends SpaceIndexerBase {

    /**
     * Name of the property where the "last indexed document update date" is stored.
     *
     * @see IESIntegration#storeDatetimeValue(String, String, Date, BulkRequestBuilder)
     * @see IESIntegration#readDatetimeValue(String, String)
     */
    protected static final String STORE_PROPERTYNAME_LAST_INDEXED_DOC_UPDATE_DATE = "lastIndexedDocumentUpdateDate";

    /**
     * Create and configure the indexer.
     *
     * @param spaceKey to be indexed by this indexer.
     * @param fullUpdate true to request a full index update (it may be changed from false to true by this indexer if
     *          necessary)
     * @param remoteSystemClient configured client to be used to obtain information from the remote system.
     * @param esIntegrationComponent to be used to call River component and ElasticSearch functions
     * @param documentIndexStructureBuilder to be used during indexing
     */
    public SpaceByLastUpdateTimestampIndexer(String spaceKey, boolean fullUpdate,
            IRemoteSystemClient remoteSystemClient, IESIntegration esIntegrationComponent,
            IDocumentIndexStructureBuilder documentIndexStructureBuilder) {
        super(spaceKey, remoteSystemClient, esIntegrationComponent, documentIndexStructureBuilder);
        logger = esIntegrationComponent.createLogger(SpaceByLastUpdateTimestampIndexer.class);
        indexingInfo = new SpaceIndexingInfo(spaceKey, fullUpdate);
    }

    @Override
    protected void processUpdate() throws Exception {
        indexingInfo.documentsUpdated = 0;
        Date updatedAfter = null;
        if (!indexingInfo.fullUpdate) {
            updatedAfter = readLastDocumentUpdatedDate(spaceKey);
        }
        Date updatedAfterStarting = updatedAfter;
        if (updatedAfter == null)
            indexingInfo.fullUpdate = true;
        Date lastDocumentUpdatedDate = null;
        int startAt = 0;

        logger.info("Going to perform {} update for Space {}", indexingInfo.fullUpdate ? "full" : "incremental",
                spaceKey);

        boolean cont = true;
        while (cont) {
            if (isClosed())
                throw new InterruptedException("Interrupted because River is closed");

            if (logger.isDebugEnabled())
                logger.debug("Going to ask the remote system for updated documents for space {} with startAt {} and updated {}",
                        spaceKey, startAt, (updatedAfter != null ? ("after " + updatedAfter) : "in whole history"));
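            // ask the remote system for the next batch of documents changed after 'updatedAfter' (or over the whole
            // history when it is null), starting at offset 'startAt'; the pagination strategy below adapts to whether
            // the batch spans distinct update timestamps and whether the response carries a total count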
("after " + updatedAfter) : "in whole history")); ChangedDocumentsResults res = remoteSystemClient.getChangedDocuments(spaceKey, startAt, indexingInfo.fullUpdate, updatedAfter); if (res.getDocumentsCount() == 0) { cont = false; } else { if (isClosed()) throw new InterruptedException("Interrupted because River is closed"); Date firstDocumentUpdatedDate = null; int updatedInThisBulk = 0; boolean deletedInThisBulk = false; BulkRequestBuilder esBulk = esIntegrationComponent.prepareESBulkRequestBuilder(); for (Map<String, Object> document : res.getDocuments()) { String documentId = getDocumentIdChecked(document); if (getDocumentDetail(documentId, document)) { lastDocumentUpdatedDate = documentIndexStructureBuilder.extractDocumentUpdated(document); logger.debug("Go to update index for document '{}' with updated {}", documentId, lastDocumentUpdatedDate); if (lastDocumentUpdatedDate == null) { throw new IllegalArgumentException("Last update timestamp not found in data for document " + documentId); } if (firstDocumentUpdatedDate == null) { firstDocumentUpdatedDate = lastDocumentUpdatedDate; } if (documentIndexStructureBuilder.extractDocumentDeleted(document)) { deletedInThisBulk = prepareDeleteByRemoteDocumentId(esBulk, documentId) || deletedInThisBulk; } else { documentIndexStructureBuilder.indexDocument(esBulk, spaceKey, document); updatedInThisBulk++; } } if (isClosed()) throw new InterruptedException("Interrupted because River is closed"); } if (lastDocumentUpdatedDate != null) storeLastDocumentUpdatedDate(esBulk, spaceKey, lastDocumentUpdatedDate); if (updatedInThisBulk > 0 || deletedInThisBulk) { executeBulkUpdate(esBulk); indexingInfo.documentsUpdated += updatedInThisBulk; } // next logic depends on documents sorted by update timestamp ascending when returned from remote system if (lastDocumentUpdatedDate != null && firstDocumentUpdatedDate != null && !lastDocumentUpdatedDate.equals(firstDocumentUpdatedDate)) { // processed documents updated in different times, so we can continue by document filtering based on latest // time // of update which is more safe for concurrent changes in the remote system updatedAfter = lastDocumentUpdatedDate; if (res.getTotal() != null) cont = res.getTotal() > (res.getStartAt() + res.getDocumentsCount()); startAt = 0; } else { // no any documents found in batch // OR // more documents updated in same time, we must go over them using pagination only, which may sometimes lead // to some document update lost due concurrent changes in the remote system. But we can do it only if Total // is available from response! 
                    if (res.getTotal() != null) {
                        startAt = res.getStartAt() + res.getDocumentsCount();
                        cont = res.getTotal() > startAt;
                    } else {
                        long t = 0;
                        if (lastDocumentUpdatedDate != null) {
                            t = lastDocumentUpdatedDate.getTime();
                        } else if (firstDocumentUpdatedDate != null) {
                            t = firstDocumentUpdatedDate.getTime();
                        }
                        if (t > 0) {
                            updatedAfter = new Date(t + 1000);
                            logger.warn(
                                    "All documents loaded from the remote system for space '{}' contain the same update timestamp {}, but we have no total count from the response, so we may miss some documents because we shift the timestamp for the new request by one second to {}!",
                                    spaceKey, lastDocumentUpdatedDate, updatedAfter);
                            startAt = 0;
                        } else {
                            logger.warn(
                                    "All documents loaded from the remote system for space '{}' are unreachable and we have no total count of records, so we have to finish indexing for now.",
                                    spaceKey);
                            cont = false;
                        }
                    }
                }
            }
        }

        if (indexingInfo.documentsUpdated > 0 && lastDocumentUpdatedDate != null && updatedAfterStarting != null
                && updatedAfterStarting.equals(lastDocumentUpdatedDate)) {
            // no document with a newer update timestamp was found during this update cycle, so increment
            // lastDocumentUpdatedDate in the store by one second to avoid indexing the last document again and again
            // in subsequent cycles
            storeLastDocumentUpdatedDate(null, spaceKey, new Date(lastDocumentUpdatedDate.getTime() + 1000));
        }
    }

    /**
     * Get the date of the last document update for the given Space from the persistent store inside the ES cluster,
     * so we can continue the update process from this point.
     *
     * @param spaceKey to get the date for.
     * @return date of the last document update, or null if not available (in this case indexing starts from the
     *         beginning of the Space history)
     * @throws Exception in case of a problem reading the value
     * @see #storeLastDocumentUpdatedDate(BulkRequestBuilder, String, Date)
     */
    protected Date readLastDocumentUpdatedDate(String spaceKey) throws Exception {
        return esIntegrationComponent.readDatetimeValue(spaceKey, STORE_PROPERTYNAME_LAST_INDEXED_DOC_UPDATE_DATE);
    }

    /**
     * Store the date of the last document update for the given Space into the persistent store inside the ES cluster,
     * so we can continue the update process from this point next time.
     *
     * @param esBulk ElasticSearch bulk request to be used for the update
     * @param spaceKey to store the date for.
     * @param lastDocumentUpdatedDate date to store
     * @throws Exception in case of a problem storing the value
     * @see #readLastDocumentUpdatedDate(String)
     */
    protected void storeLastDocumentUpdatedDate(BulkRequestBuilder esBulk, String spaceKey,
            Date lastDocumentUpdatedDate) throws Exception {
        esIntegrationComponent.storeDatetimeValue(spaceKey, STORE_PROPERTYNAME_LAST_INDEXED_DOC_UPDATE_DATE,
                lastDocumentUpdatedDate, esBulk);
    }

}
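
/*
 * Illustrative usage sketch, not part of the original class: a minimal wiring example. The concrete
 * IRemoteSystemClient, IESIntegration and IDocumentIndexStructureBuilder instances are assumed to come from the
 * river configuration, and the run() entry point is assumed to be inherited from SpaceIndexerBase and to invoke
 * processUpdate() internally.
 *
 * SpaceByLastUpdateTimestampIndexer indexer = new SpaceByLastUpdateTimestampIndexer("MYSPACE", false,
 *         remoteSystemClient, esIntegrationComponent, documentIndexStructureBuilder);
 * indexer.run();
 */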