/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.aws.bolt;
import static com.digitalpebble.stormcrawler.Constants.StatusStreamName;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.amazonaws.regions.RegionUtils;
import com.amazonaws.services.cloudsearchdomain.AmazonCloudSearchDomainClient;
import com.amazonaws.services.cloudsearchdomain.model.ContentType;
import com.amazonaws.services.cloudsearchdomain.model.DocumentServiceWarning;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsRequest;
import com.amazonaws.services.cloudsearchdomain.model.UploadDocumentsResult;
import com.amazonaws.services.cloudsearchv2.AmazonCloudSearchClient;
import com.amazonaws.services.cloudsearchv2.model.DescribeDomainsRequest;
import com.amazonaws.services.cloudsearchv2.model.DescribeDomainsResult;
import com.amazonaws.services.cloudsearchv2.model.DescribeIndexFieldsRequest;
import com.amazonaws.services.cloudsearchv2.model.DescribeIndexFieldsResult;
import com.amazonaws.services.cloudsearchv2.model.DomainStatus;
import com.amazonaws.services.cloudsearchv2.model.IndexFieldStatus;
import com.amazonaws.util.json.JSONException;
import com.amazonaws.util.json.JSONObject;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.indexing.AbstractIndexerBolt;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import org.apache.storm.Config;
import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.apache.storm.utils.TupleUtils;
/**
 * Writes documents to AWS CloudSearch. Documents are buffered and sent in
 * batches which are flushed based on their size, document count and the time
 * elapsed since the last flush. The document service endpoint is set with
 * 'cloudsearch.endpoint'; the region, batch size and buffering time are
 * configured via the keys in {@link CloudSearchConstants}. When
 * 'cloudsearch.batch.dump' is true, batches are written to temporary JSON
 * files instead of being sent, which is useful for debugging.
 */
@SuppressWarnings("serial")
public class CloudSearchIndexerBolt extends AbstractIndexerBolt {
public static final Logger LOG = LoggerFactory
.getLogger(CloudSearchIndexerBolt.class);
    /** Maximum size of a batch accepted by the CloudSearch document service (5 MB) **/
    private static final int MAX_SIZE_BATCH_BYTES = 5242880;
    /** Maximum size of a single document (1 MB) **/
    private static final int MAX_SIZE_DOC_BYTES = 1048576;
    /** SimpleDateFormat is not thread-safe: use one instance per bolt **/
    private final SimpleDateFormat dateFormat = new SimpleDateFormat(
            "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
private AmazonCloudSearchDomainClient client;
private int maxDocsInBatch = -1;
private StringBuffer buffer;
private int numDocsInBatch = 0;
    /** Maximum time (in seconds) to buffer the documents before sending a batch **/
private int maxTimeBuffered = 10;
private boolean dumpBatchFilesToTemp = false;
private OutputCollector _collector;
private MultiCountMetric eventCounter;
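    /** Names and types of the fields defined in the CloudSearch domain **/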
private Map<String, String> csfields = new HashMap<>();
private long timeLastBatchSent = System.currentTimeMillis();
private List<Tuple> unacked = new ArrayList<>();
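    /**
     * Resolves the CloudSearch domain matching the configured endpoint, caches
     * the names and types of its index fields and creates the document service
     * client. When 'cloudsearch.batch.dump' is set, no connection is made and
     * the batches are simply written to temporary files.
     */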
@SuppressWarnings({ "rawtypes", "unchecked" })
@Override
public void prepare(Map conf, TopologyContext context,
OutputCollector collector) {
super.prepare(conf, context, collector);
_collector = collector;
this.eventCounter = context.registerMetric("CloudSearchIndexer",
new MultiCountMetric(), 10);
maxTimeBuffered = ConfUtils.getInt(conf,
CloudSearchConstants.MAX_TIME_BUFFERED, 10);
maxDocsInBatch = ConfUtils.getInt(conf,
CloudSearchConstants.MAX_DOCS_BATCH, -1);
buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
dumpBatchFilesToTemp = ConfUtils.getBoolean(conf,
"cloudsearch.batch.dump", false);
if (dumpBatchFilesToTemp) {
            // only dumping to a local file, no further configuration required
return;
}
String endpoint = ConfUtils.getString(conf, "cloudsearch.endpoint");
if (StringUtils.isBlank(endpoint)) {
String message = "Missing CloudSearch endpoint";
LOG.error(message);
throw new RuntimeException(message);
}
String regionName = ConfUtils.getString(conf,
CloudSearchConstants.REGION);
AmazonCloudSearchClient cl = new AmazonCloudSearchClient();
if (StringUtils.isNotBlank(regionName)) {
cl.setRegion(RegionUtils.getRegion(regionName));
}
String domainName = null;
// retrieve the domain name
DescribeDomainsResult domains = cl
.describeDomains(new DescribeDomainsRequest());
Iterator<DomainStatus> dsiter = domains.getDomainStatusList()
.iterator();
while (dsiter.hasNext()) {
DomainStatus ds = dsiter.next();
if (ds.getDocService().getEndpoint().equals(endpoint)) {
domainName = ds.getDomainName();
break;
}
}
// check domain name
if (StringUtils.isBlank(domainName)) {
throw new RuntimeException(
"No domain name found for CloudSearch endpoint");
}
DescribeIndexFieldsResult indexDescription = cl
.describeIndexFields(new DescribeIndexFieldsRequest()
.withDomainName(domainName));
for (IndexFieldStatus ifs : indexDescription.getIndexFields()) {
String indexname = ifs.getOptions().getIndexFieldName();
String indextype = ifs.getOptions().getIndexFieldType();
LOG.info("CloudSearch index name {} of type {}", indexname,
indextype);
csfields.put(indexname, indextype);
        }
        // the configuration client is no longer needed
        cl.shutdown();
        client = new AmazonCloudSearchDomainClient();
        client.setEndpoint(endpoint);
}
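    /**
     * On tick tuples, flushes the current batch if it has been buffered longer
     * than the configured maximum time. For regular tuples, builds a
     * CloudSearch 'add' document in JSON from the filtered metadata, text and
     * URL and adds it to the current batch.
     */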
@Override
public void execute(Tuple tuple) {
if (TupleUtils.isTick(tuple)) {
// check when we last sent a batch
long now = System.currentTimeMillis();
long gap = now - timeLastBatchSent;
if (gap >= maxTimeBuffered * 1000) {
sendBatch();
}
_collector.ack(tuple);
return;
}
String url = tuple.getStringByField("url");
// Distinguish the value used for indexing
// from the one used for the status
String normalisedurl = valueForURL(tuple);
Metadata metadata = (Metadata) tuple.getValueByField("metadata");
String text = tuple.getStringByField("text");
boolean keep = filterDocument(metadata);
if (!keep) {
eventCounter.scope("Filtered").incrBy(1);
// treat it as successfully processed even if
// we do not index it
_collector.emit(StatusStreamName, tuple, new Values(url, metadata,
Status.FETCHED));
_collector.ack(tuple);
return;
}
try {
JSONObject doc_builder = new JSONObject();
doc_builder.put("type", "add");
// generate the id from the normalised url
String ID = CloudSearchUtils.getID(normalisedurl);
doc_builder.put("id", ID);
JSONObject fields = new JSONObject();
// which metadata to include as fields
Map<String, String[]> keyVals = filterMetadata(metadata);
for (final Entry<String, String[]> e : keyVals.entrySet()) {
String fieldname = CloudSearchUtils.cleanFieldName(e.getKey());
String type = csfields.get(fieldname);
// undefined in index
if (type == null && !this.dumpBatchFilesToTemp) {
LOG.info(
"Field {} not defined in CloudSearch domain for {} - skipping.",
fieldname, url);
continue;
}
String[] values = e.getValue();
                // keep a single value unless the field is defined as an array
                // type in the index
if (values.length > 1
&& !StringUtils.containsIgnoreCase(type, "-array")) {
LOG.info(
"{} values found for field {} of type {} - keeping only the first one. {}",
values.length, fieldname, type, url);
values = new String[] { values[0] };
}
// write the values
for (String value : values) {
// Check that the date format is correct
if (StringUtils.containsIgnoreCase(type, "date")) {
try {
                            dateFormat.parse(value);
} catch (ParseException pe) {
LOG.info("Unparsable date {}", value);
continue;
}
}
// normalise strings
else {
value = CloudSearchUtils.stripNonCharCodepoints(value);
}
fields.accumulate(fieldname, value);
}
}
// include the url ?
String fieldNameForURL = fieldNameForURL();
if (StringUtils.isNotBlank(fieldNameForURL)) {
fieldNameForURL = CloudSearchUtils
.cleanFieldName(fieldNameForURL);
if (this.dumpBatchFilesToTemp
|| csfields.get(fieldNameForURL) != null) {
String _url = CloudSearchUtils
.stripNonCharCodepoints(normalisedurl);
fields.put(fieldNameForURL, _url);
}
}
// include the text ?
String fieldNameForText = fieldNameForText();
if (StringUtils.isNotBlank(fieldNameForText)) {
fieldNameForText = CloudSearchUtils
.cleanFieldName(fieldNameForText);
if (this.dumpBatchFilesToTemp
|| csfields.get(fieldNameForText) != null) {
text = CloudSearchUtils.stripNonCharCodepoints(text);
fields.put(fieldNameForText, text);
}
}
doc_builder.put("fields", fields);
addToBatch(doc_builder.toString(2), url, tuple);
} catch (JSONException e) {
LOG.error("Exception caught while building JSON object", e);
            // resending would produce the same result - no point in retrying
_collector.emit(StatusStreamName, tuple, new Values(url, metadata,
Status.ERROR));
_collector.ack(tuple);
}
}
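    /**
     * Appends a document to the current batch. The batch is sent beforehand if
     * the document would not fit, and afterwards if the maximum number of
     * documents per batch has been reached. Documents larger than the
     * CloudSearch limit are reported as errors and skipped.
     */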
private void addToBatch(String currentDoc, String url, Tuple tuple) {
int currentDocLength = currentDoc.getBytes(StandardCharsets.UTF_8).length;
        // check that the doc is not too large and skip it if it is
        if (currentDocLength > MAX_SIZE_DOC_BYTES) {
            LOG.error("Doc too large. currentDoc.length {} : {}",
                    currentDocLength, url);
            // too large to ever be indexed: report an error and ack the tuple
            Metadata metadata = (Metadata) tuple.getValueByField("metadata");
            _collector.emit(StatusStreamName, tuple, new Values(url, metadata,
                    Status.ERROR));
            _collector.ack(tuple);
            return;
        }
int currentBufferLength = buffer.toString().getBytes(
StandardCharsets.UTF_8).length;
LOG.debug("currentDoc.length {}, buffer length {}", currentDocLength,
currentBufferLength);
// can add it to the buffer without overflowing?
if (currentDocLength + 2 + currentBufferLength < MAX_SIZE_BATCH_BYTES) {
if (numDocsInBatch != 0)
buffer.append(',');
buffer.append(currentDoc);
this.unacked.add(tuple);
numDocsInBatch++;
}
// flush the previous batch and create a new one with this doc
else {
sendBatch();
buffer.append(currentDoc);
this.unacked.add(tuple);
numDocsInBatch++;
}
// have we reached the max number of docs in a batch after adding
// this doc?
if (maxDocsInBatch > 0 && numDocsInBatch == maxDocsInBatch) {
sendBatch();
}
}
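    /**
     * Sends the current batch to CloudSearch, or writes it to a temporary file
     * when 'cloudsearch.batch.dump' is set. Buffered tuples are emitted to the
     * status stream as FETCHED and acked on success, or failed on error; the
     * buffer is reset in both cases.
     */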
public void sendBatch() {
timeLastBatchSent = System.currentTimeMillis();
// nothing to do
if (numDocsInBatch == 0) {
return;
}
// close the array
buffer.append(']');
LOG.info("Sending {} docs to CloudSearch", numDocsInBatch);
byte[] bb = buffer.toString().getBytes(StandardCharsets.UTF_8);
if (dumpBatchFilesToTemp) {
try {
File temp = File.createTempFile("CloudSearch_", ".json");
FileUtils.writeByteArrayToFile(temp, bb);
LOG.info("Wrote batch file {}", temp.getName());
// ack the tuples
for (Tuple t : unacked) {
String url = t.getStringByField("url");
Metadata metadata = (Metadata) t
.getValueByField("metadata");
_collector.emit(StatusStreamName, t, new Values(url,
metadata, Status.FETCHED));
_collector.ack(t);
}
unacked.clear();
} catch (IOException e1) {
LOG.error("Exception while generating batch file", e1);
// fail the tuples
for (Tuple t : unacked) {
_collector.fail(t);
}
unacked.clear();
} finally {
// reset buffer and doc counter
buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
numDocsInBatch = 0;
}
return;
}
// not in debug mode
try (InputStream inputStream = new ByteArrayInputStream(bb)) {
UploadDocumentsRequest batch = new UploadDocumentsRequest();
batch.setContentLength((long) bb.length);
batch.setContentType(ContentType.Applicationjson);
batch.setDocuments(inputStream);
UploadDocumentsResult result = client.uploadDocuments(batch);
LOG.info(result.getStatus());
for (DocumentServiceWarning warning : result.getWarnings()) {
LOG.info(warning.getMessage());
}
if (!result.getWarnings().isEmpty()) {
eventCounter.scope("Warnings").incrBy(
result.getWarnings().size());
}
eventCounter.scope("Added").incrBy(result.getAdds());
// ack the tuples
for (Tuple t : unacked) {
String url = t.getStringByField("url");
Metadata metadata = (Metadata) t.getValueByField("metadata");
_collector.emit(StatusStreamName, t, new Values(url, metadata,
Status.FETCHED));
_collector.ack(t);
}
unacked.clear();
} catch (Exception e) {
LOG.error("Exception while sending batch", e);
LOG.error(buffer.toString());
// fail the tuples
for (Tuple t : unacked) {
_collector.fail(t);
}
unacked.clear();
} finally {
// reset buffer and doc counter
buffer = new StringBuffer(MAX_SIZE_BATCH_BYTES).append('[');
numDocsInBatch = 0;
}
}
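    /** Flushes any remaining documents before the bolt shuts down. **/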
    @Override
    public void cleanup() {
        // flush any unsent documents
        sendBatch();
        // the client is null when batches are only dumped to files
        if (client != null) {
            client.shutdown();
        }
}
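    /**
     * Requests a tick tuple every second so that buffered documents are
     * flushed even when no new tuples arrive.
     */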
@Override
public Map<String, Object> getComponentConfiguration() {
Config conf = new Config();
conf.put(Config.TOPOLOGY_TICK_TUPLE_FREQ_SECS, 1);
return conf;
}
}