/*
 * Licensed to Laurent Broudoux (the "Author") under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Author licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package com.github.lbroudoux.elasticsearch.river.s3.river;

import java.util.*;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.S3ObjectSummary;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3Connector;
import com.github.lbroudoux.elasticsearch.river.s3.connector.S3ObjectSummaries;
import com.github.lbroudoux.elasticsearch.river.s3.river.TikaHolder;
import org.apache.tika.metadata.Metadata;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.admin.indices.mapping.put.PutMappingResponse;
import org.elasticsearch.action.bulk.*;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.BytesStreamInput;
import org.elasticsearch.common.util.concurrent.EsExecutors;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.support.XContentMapValues;
import org.elasticsearch.indices.IndexAlreadyExistsException;
import org.elasticsearch.river.AbstractRiverComponent;
import org.elasticsearch.river.River;
import org.elasticsearch.river.RiverName;
import org.elasticsearch.river.RiverSettings;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.threadpool.ThreadPool;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

/**
 * A River component for scanning and indexing Amazon S3 documents into Elasticsearch.
 * @author laurent
 */
public class S3River extends AbstractRiverComponent implements River {

   private final Client client;
   private final ThreadPool threadPool;

   private final String indexName;
   private final String typeName;
   private final int bulkSize;

   private RiverStatus riverStatus;

   private volatile Thread feedThread;
   private volatile BulkProcessor bulkProcessor;
   private volatile boolean closed = false;

   private final S3RiverFeedDefinition feedDefinition;
   private final S3Connector s3;

   @Inject
   @SuppressWarnings({"unchecked"})
   protected S3River(RiverName riverName, RiverSettings settings, Client client, ThreadPool threadPool) throws Exception {
      super(riverName, settings);
      this.client = client;
      this.threadPool = threadPool;
      this.riverStatus = RiverStatus.UNKNOWN;

      // Deal with connector settings.
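      // For reference, a hypothetical river creation document matching the settings parsed
      // below (all names come from the parsing code; the values are illustrative only):
      //
      // {
      //   "amazon-s3": {
      //     "name": "my-feed",
      //     "bucket": "my-bucket",              // mandatory
      //     "pathPrefix": "docs/",
      //     "download_host": "my-download-host",
      //     "update_rate": 900000,              // milliseconds, defaults to 15 minutes
      //     "json_support": false,
      //     "indexed_chars_ratio": 0.0,
      //     "includes": "*.pdf",
      //     "excludes": "*.tmp",
      //     "accessKey": "my-access-key",       // or "use_EC2_IAM": true
      //     "secretKey": "my-secret-key"
      //   },
      //   "index": {
      //     "index": "s3docs",
      //     "type": "doc",
      //     "bulk_size": 100
      //   }
      // }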
      if (settings.settings().containsKey("amazon-s3")) {
         Map<String, Object> feed = (Map<String, Object>) settings.settings().get("amazon-s3");

         // Retrieve feed settings.
         String feedname = XContentMapValues.nodeStringValue(feed.get("name"), null);
         String bucket = XContentMapValues.nodeStringValue(feed.get("bucket"), null);
         String pathPrefix = XContentMapValues.nodeStringValue(feed.get("pathPrefix"), null);
         String downloadHost = XContentMapValues.nodeStringValue(feed.get("download_host"), null);
         int updateRate = XContentMapValues.nodeIntegerValue(feed.get("update_rate"), 15 * 60 * 1000);
         boolean jsonSupport = XContentMapValues.nodeBooleanValue(feed.get("json_support"), false);
         double indexedCharsRatio = XContentMapValues.nodeDoubleValue(feed.get("indexed_chars_ratio"), 0.0);

         String[] includes = S3RiverUtil.buildArrayFromSettings(settings.settings(), "amazon-s3.includes");
         String[] excludes = S3RiverUtil.buildArrayFromSettings(settings.settings(), "amazon-s3.excludes");

         // Retrieve connection settings.
         String accessKey = XContentMapValues.nodeStringValue(feed.get("accessKey"), null);
         String secretKey = XContentMapValues.nodeStringValue(feed.get("secretKey"), null);
         boolean useIAMRoleForEC2 = XContentMapValues.nodeBooleanValue(feed.get("use_EC2_IAM"), false);

         feedDefinition = new S3RiverFeedDefinition(feedname, bucket, pathPrefix, downloadHost,
               updateRate, Arrays.asList(includes), Arrays.asList(excludes),
               accessKey, secretKey, useIAMRoleForEC2, jsonSupport, indexedCharsRatio);
      } else {
         logger.error("You didn't define the amazon-s3 settings. Exiting... See https://github.com/lbroudoux/es-amazon-s3-river");
         indexName = null;
         typeName = null;
         bulkSize = 100;
         feedDefinition = null;
         s3 = null;
         return;
      }

      // Deal with index settings if provided.
      if (settings.settings().containsKey("index")) {
         Map<String, Object> indexSettings = (Map<String, Object>) settings.settings().get("index");
         indexName = XContentMapValues.nodeStringValue(indexSettings.get("index"), riverName.name());
         typeName = XContentMapValues.nodeStringValue(indexSettings.get("type"), S3RiverUtil.INDEX_TYPE_DOC);
         bulkSize = XContentMapValues.nodeIntegerValue(indexSettings.get("bulk_size"), 100);
      } else {
         indexName = riverName.name();
         typeName = S3RiverUtil.INDEX_TYPE_DOC;
         bulkSize = 100;
      }

      // We can only connect to Amazon S3 once mandatory settings are present.
      if (feedDefinition.getBucket() == null) {
         logger.error("Amazon S3 bucket should not be null. Please fix this.");
         throw new IllegalArgumentException("Amazon S3 bucket should not be null.");
      }

      // Connect using the appropriate authentication process.
      if (feedDefinition.getAccessKey() == null && feedDefinition.getSecretKey() == null) {
         s3 = new S3Connector(feedDefinition.isUseIAMRoleForEC2());
      } else {
         s3 = new S3Connector(feedDefinition.getAccessKey(), feedDefinition.getSecretKey());
      }
      try {
         s3.connectUserBucket(feedDefinition.getBucket(), feedDefinition.getPathPrefix());
      } catch (AmazonS3Exception ase) {
         logger.error("Exception while connecting to the Amazon S3 user bucket. "
               + "Either the access key, secret key, IAM role or bucket name is incorrect");
         throw ase;
      }

      this.riverStatus = RiverStatus.INITIALIZED;
   }

   @Override
   public void start() {
      if (logger.isInfoEnabled()) {
         logger.info("Starting amazon s3 river scanning");
      }
      this.riverStatus = RiverStatus.STARTING;

      // Let's start this in another thread so we won't block the start process.
      threadPool.generic().execute(new Runnable() {
         @Override
         public void run() {
            // First wait for at least a yellow cluster state.
            logger.debug("Waiting for yellow status");
            client.admin().cluster().prepareHealth("_river").setWaitForYellowStatus().get();
            logger.debug("Yellow or green status received");

            try {
               // Create the index if it doesn't exist.
               if (!client.admin().indices().prepareExists(indexName).execute().actionGet().isExists()) {
                  client.admin().indices().prepareCreate(indexName).execute().actionGet();
               }
            } catch (Exception e) {
               if (ExceptionsHelper.unwrapCause(e) instanceof IndexAlreadyExistsException) {
                  // That's fine.
               } else if (ExceptionsHelper.unwrapCause(e) instanceof ClusterBlockException) {
                  // Ok, not recovered yet... let's start indexing and hope we recover by the first bulk.
               } else {
                  logger.warn("failed to create index [{}], disabling river...", e, indexName);
                  return;
               }
            }

            try {
               // If needed, we create the new mapping for files.
               if (!feedDefinition.isJsonSupport()) {
                  pushMapping(indexName, typeName, S3RiverUtil.buildS3FileMapping(typeName));
               }
            } catch (Exception e) {
               logger.warn("Failed to create mapping for [{}/{}], disabling river...", e, indexName, typeName);
               return;
            }

            // Creating bulk processor.
            bulkProcessor = BulkProcessor.builder(client, new BulkProcessor.Listener() {
               @Override
               public void beforeBulk(long id, BulkRequest request) {
                  logger.debug("Going to execute new bulk composed of {} actions", request.numberOfActions());
               }

               @Override
               public void afterBulk(long id, BulkRequest request, BulkResponse response) {
                  logger.debug("Executed bulk composed of {} actions", request.numberOfActions());
                  if (response.hasFailures()) {
                     logger.warn("There were failures while executing bulk: {}", response.buildFailureMessage());
                     if (logger.isDebugEnabled()) {
                        for (BulkItemResponse item : response.getItems()) {
                           if (item.isFailed()) {
                              logger.debug("Error for {}/{}/{} for {} operation: {}", item.getIndex(),
                                    item.getType(), item.getId(), item.getOpType(), item.getFailureMessage());
                           }
                        }
                     }
                  }
               }

               @Override
               public void afterBulk(long id, BulkRequest request, Throwable throwable) {
                  logger.warn("Error executing bulk", throwable);
               }
            })
            .setBulkActions(bulkSize)
            .build();

            // Start the scanner thread for this feed.
            feedThread = EsExecutors.daemonThreadFactory(settings.globalSettings(), "fs_slurper")
                  .newThread(new S3Scanner(feedDefinition));
            feedThread.start();
            riverStatus = RiverStatus.RUNNING;
         }
      });
   }

   @Override
   public void close() {
      if (logger.isInfoEnabled()) {
         logger.info("Closing amazon s3 river");
      }
      closed = true;
      riverStatus = RiverStatus.STOPPING;

      // We have to close the scanning thread.
      if (feedThread != null) {
         feedThread.interrupt();
      }
      riverStatus = RiverStatus.STOPPED;
   }

   /**
    * Check if a mapping already exists in an index.
    * @param index Index name
    * @param type Mapping name
    * @return true if mapping exists
    */
   private boolean isMappingExist(String index, String type) {
      ClusterState cs = client.admin().cluster().prepareState()
            .setIndices(index).execute().actionGet()
            .getState();

      // Check index metadata existence.
      IndexMetaData imd = cs.getMetaData().index(index);
      if (imd == null) {
         return false;
      }

      // Check mapping metadata existence.
      MappingMetaData mdd = imd.mapping(type);
      if (mdd != null) {
         return true;
      }
      return false;
   }

   private void pushMapping(String index, String type, XContentBuilder xcontent) throws Exception {
      if (logger.isTraceEnabled()) {
         logger.trace("pushMapping(" + index + ", " + type + ")");
      }

      // If type does not exist, we create it.
      boolean mappingExist = isMappingExist(index, type);
      if (!mappingExist) {
         logger.debug("Mapping [" + index + "]/[" + type + "] doesn't exist. Creating it.");
         // Read the mapping json content if it exists and use it.
         if (xcontent != null) {
            if (logger.isTraceEnabled()) {
               logger.trace("Mapping for [" + index + "]/[" + type + "]=" + xcontent.string());
            }
            // Create type and mapping.
            PutMappingResponse response = client.admin().indices()
                  .preparePutMapping(index)
                  .setType(type)
                  .setSource(xcontent)
                  .execute().actionGet();
            if (!response.isAcknowledged()) {
               throw new Exception("Could not define mapping for type [" + index + "]/[" + type + "].");
            } else {
               if (logger.isDebugEnabled()) {
                  if (mappingExist) {
                     logger.debug("Mapping definition for [" + index + "]/[" + type + "] successfully merged.");
                  } else {
                     logger.debug("Mapping definition for [" + index + "]/[" + type + "] successfully created.");
                  }
               }
            }
         } else {
            if (logger.isDebugEnabled()) {
               logger.debug("No mapping definition for [" + index + "]/[" + type + "]. Ignoring.");
            }
         }
      } else {
         if (logger.isDebugEnabled()) {
            logger.debug("Mapping [" + index + "]/[" + type + "] already exists and mergeMapping is not set.");
         }
      }
      if (logger.isTraceEnabled()) {
         logger.trace("/pushMapping(" + index + ", " + type + ")");
      }
   }

   /** Runnable in charge of periodically scanning the Amazon S3 bucket and feeding the index. */
   private class S3Scanner implements Runnable {

      private BulkRequestBuilder bulk;
      private S3RiverFeedDefinition feedDefinition;

      public S3Scanner(S3RiverFeedDefinition feedDefinition) {
         this.feedDefinition = feedDefinition;
      }

      @Override
      public void run() {
         while (true) {
            if (closed) {
               return;
            }
            try {
               if (isStarted()) {
                  // Scan folder starting from last changes id, then record the new one.
                  Long lastScanTime = getLastScanTimeFromRiver("_lastScanTime");
                  lastScanTime = scan(lastScanTime);
                  updateRiver("_lastScanTime", lastScanTime);
               } else {
                  logger.info("Amazon S3 River is disabled for {}", riverName().name());
               }
            } catch (Exception e) {
               logger.warn("Error while indexing content from {}", feedDefinition.getBucket());
               if (logger.isDebugEnabled()) {
                  logger.debug("Exception for folder {} is {}", feedDefinition.getBucket(), e);
                  e.printStackTrace();
               }
            }

            try {
               if (logger.isDebugEnabled()) {
                  logger.debug("Amazon S3 river is going to sleep for {} ms", feedDefinition.getUpdateRate());
               }
               Thread.sleep(feedDefinition.getUpdateRate());
            } catch (InterruptedException ie) {
            }
         }
      }

      private boolean isStarted() {
         // Refresh index before querying it.
         client.admin().indices().prepareRefresh("_river").execute().actionGet();
         GetResponse isStartedGetResponse = client.prepareGet("_river", riverName().name(), "_s3status").execute().actionGet();
         try {
            if (!isStartedGetResponse.isExists()) {
               XContentBuilder xb = jsonBuilder().startObject()
                     .startObject("amazon-s3")
                        .field("feedname", feedDefinition.getFeedname())
                        .field("status", "STARTED").endObject()
                     .endObject();
               client.prepareIndex("_river", riverName.name(), "_s3status").setSource(xb).execute();
               return true;
            } else {
               String status = (String) XContentMapValues.extractValue("amazon-s3.status", isStartedGetResponse.getSourceAsMap());
               if ("STOPPED".equals(status)) {
                  return false;
               }
            }
         } catch (Exception e) {
            logger.warn("failed to get status for " + riverName().name() + ", throttling....", e);
         }
         return true;
      }

      @SuppressWarnings("unchecked")
      private Long getLastScanTimeFromRiver(String lastScanTimeField) {
         Long result = null;
         try {
            // Refresh the river index before querying the last scan time.
            client.admin().indices().prepareRefresh("_river").execute().actionGet();
            GetResponse lastSeqGetResponse = client.prepareGet("_river", riverName().name(), lastScanTimeField).execute().actionGet();
            if (lastSeqGetResponse.isExists()) {
               Map<String, Object> fsState = (Map<String, Object>) lastSeqGetResponse.getSourceAsMap().get("amazon-s3");
               if (fsState != null) {
                  Object lastScanTime = fsState.get(lastScanTimeField);
                  if (lastScanTime != null) {
                     try {
                        result = Long.parseLong(lastScanTime.toString());
                     } catch (NumberFormatException nfe) {
                        logger.warn("Last recorded lastScanTime is not a Long {}", lastScanTime.toString());
                     }
                  }
               }
            } else {
               // This is the first call, just log in debug mode.
               if (logger.isDebugEnabled()) {
                  logger.debug("{} doesn't exist", lastScanTimeField);
               }
            }
         } catch (Exception e) {
            logger.warn("failed to get _lastScanTimeField, throttling....", e);
         }

         if (logger.isDebugEnabled()) {
            logger.debug("lastScanTimeField: {}", result);
         }
         return result;
      }

      /** Scan the Amazon S3 bucket for latest changes. */
      private Long scan(Long lastScanTime) throws Exception {
         if (logger.isDebugEnabled()) {
            logger.debug("Starting scanning of bucket {} since {}", feedDefinition.getBucket(), lastScanTime);
         }
         S3ObjectSummaries summaries = s3.getObjectSummaries(lastScanTime);

         // Store the ids of files that are already indexed.
         List<String> previousFileIds = getAlreadyIndexFileIds();

         // Browse changes and check whether each one is indexable before indexing it.
         for (S3ObjectSummary summary : summaries.getPickedSummaries()) {
            if (S3RiverUtil.isIndexable(summary.getKey(), feedDefinition.getIncludes(), feedDefinition.getExcludes())) {
               indexFile(summary);
            }
         }

         // Now, because we do not get changes but only present files, we should
         // compare previously indexed files with the latest ones to extract deleted ones...
         // But first, we need to produce a list of index ids corresponding to S3 keys.
         List<String> summariesIds = new ArrayList<String>();
         for (String key : summaries.getKeys()) {
            summariesIds.add(buildIndexIdFromS3Key(key));
         }
         for (String previousFileId : previousFileIds) {
            if (!summariesIds.contains(previousFileId)) {
               esDelete(indexName, typeName, previousFileId);
            }
         }

         return summaries.getLastScanTime();
      }

      /** Retrieve the ids of files already present in the index. */
      private List<String> getAlreadyIndexFileIds() {
         List<String> fileIds = new ArrayList<String>();
         // TODO: Should be later optimized for only retrieving ids and getting
         // over the 5000 hits limitation.
         SearchResponse response = client.prepareSearch(indexName)
               .setSearchType(SearchType.QUERY_AND_FETCH)
               .setTypes(typeName)
               .setFrom(0)
               .setSize(5000)
               .execute().actionGet();
         if (response.getHits() != null && response.getHits().getHits() != null) {
            for (SearchHit hit : response.getHits().getHits()) {
               fileIds.add(hit.getId());
            }
         }
         return fileIds;
      }

      /** Index an Amazon S3 file by retrieving its content and building the suitable Json content. */
      private String indexFile(S3ObjectSummary summary) {
         if (logger.isDebugEnabled()) {
            logger.debug("Trying to index '{}'", summary.getKey());
         }

         try {
            // Build a unique id from the S3 unique summary key.
            String fileId = buildIndexIdFromS3Key(summary.getKey());

            if (feedDefinition.isJsonSupport()) {
               esIndex(indexName, typeName, summary.getKey(), s3.getContent(summary));
            } else {
               byte[] fileContent = s3.getContent(summary);

               if (fileContent != null) {
                  // Compute number of chars to index.
                  // see https://github.com/lbroudoux/es-amazon-s3-river/issues/36
                  int indexedChars = 100000;
                  if (feedDefinition.getIndexedCharsRatio() > 0) {
                     indexedChars = (int) Math.round(fileContent.length * feedDefinition.getIndexedCharsRatio());
                  }

                  // Parse content using Tika directly.
                  Metadata fileMetadata = new Metadata();
                  String parsedContent = TikaHolder.tika().parseToString(
                        new BytesStreamInput(fileContent), fileMetadata, indexedChars);

                  // Store Tika metadata into a map.
                  Map<String, Object> fileMetadataMap = new HashMap<String, Object>();
                  for (String key : fileMetadata.names()) {
                     fileMetadataMap.put(key, fileMetadata.get(key));
                  }

                  esIndex(indexName, typeName, fileId,
                        jsonBuilder()
                           .startObject()
                              .field(S3RiverUtil.DOC_FIELD_TITLE, summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                              .field(S3RiverUtil.DOC_FIELD_MODIFIED_DATE, summary.getLastModified().getTime())
                              .field(S3RiverUtil.DOC_FIELD_SOURCE_URL, s3.getDownloadUrl(summary, feedDefinition))
                              .field(S3RiverUtil.DOC_FIELD_METADATA, s3.getS3UserMetadata(summary.getKey()))
                              .startObject("file")
                                 .field("_name", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                                 .field("title", summary.getKey().substring(summary.getKey().lastIndexOf('/') + 1))
                                 .field("file", parsedContent)
                                 .field("metadata", fileMetadataMap)
                              .endObject()
                           .endObject()
                  );
                  return fileId;
               }
            }
         } catch (Exception e) {
            logger.warn("Cannot index " + summary.getKey() + " : " + e.getMessage());
         }
         return null;
      }

      /** Build a unique id from the S3 unique summary key. */
      private String buildIndexIdFromS3Key(String key) {
         return key.replace('/', '-');
      }

      /** Update the river's last scan time value. */
      private void updateRiver(String lastScanTimeField, Long lastScanTime) throws Exception {
         if (logger.isDebugEnabled()) {
            logger.debug("Updating lastScanTimeField: {}", lastScanTime);
         }

         // We store the last update date and some stats.
         XContentBuilder xb = jsonBuilder()
               .startObject()
                  .startObject("amazon-s3")
                     .field("feedname", feedDefinition.getFeedname())
                     .field(lastScanTimeField, lastScanTime)
                  .endObject()
               .endObject();
         esIndex("_river", riverName.name(), lastScanTimeField, xb);
      }

      /** Add to bulk an IndexRequest built from an XContentBuilder source. */
      private void esIndex(String index, String type, String id, XContentBuilder xb) throws Exception {
         if (logger.isDebugEnabled()) {
            logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
         }
         if (logger.isTraceEnabled()) {
            logger.trace("Json indexed : {}", xb.string());
         }
         bulkProcessor.add(client.prepareIndex(index, type, id).setSource(xb).request());
      }

      /** Add to bulk an IndexRequest built from a raw Json byte array source. */
      private void esIndex(String index, String type, String id, byte[] json) throws Exception {
         if (logger.isDebugEnabled()) {
            logger.debug("Indexing in ES " + index + ", " + type + ", " + id);
         }
         if (logger.isTraceEnabled()) {
            logger.trace("Json indexed : {}", json);
         }
         bulkProcessor.add(client.prepareIndex(index, type, id).setSource(json).request());
      }

      /** Add to bulk a DeleteRequest. */
      private void esDelete(String index, String type, String id) throws Exception {
         if (logger.isDebugEnabled()) {
            logger.debug("Deleting from ES " + index + ", " + type + ", " + id);
         }
         bulkProcessor.add(client.prepareDelete(index, type, id).request());
      }
   }

   private enum RiverStatus {
      UNKNOWN, INITIALIZED, STARTING, RUNNING, STOPPING, STOPPED;
   }
}