/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.stormcrawler.aws.s3;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;

import org.apache.storm.metric.api.MultiCountMetric;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.amazonaws.services.s3.model.AmazonS3Exception;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.PutObjectResult;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.ConfUtils;
/**
 * Stores binary content into Amazon S3. The AWS credentials must be stored in
 * ~/.aws/credentials and the target bucket must already exist.
 */
@SuppressWarnings("serial")
public abstract class S3Cacher extends AbstractS3CacheBolt {

    public static final Logger LOG = LoggerFactory.getLogger(S3Cacher.class);

    /**
     * Returns the bytes to store in S3 for this document, or null if there is
     * nothing to cache.
     */
    protected abstract byte[] getContentToCache(Metadata metadata,
            byte[] content, String url);

    /** Returns the prefix prepended to the key of every object stored in S3. */
    protected abstract String getKeyPrefix();

    /** Returns the prefix used to name the metrics emitted by this bolt. */
    protected abstract String getMetricPrefix();

    /**
     * Indicates whether the content should be written to S3 even if an entry
     * already exists in the cache.
     */
    protected abstract boolean shouldOverwrite(Metadata metadata);

    @Override
    public void prepare(Map conf, TopologyContext context,
            OutputCollector collector) {
        super.prepare(conf, context, collector);
        bucketName = ConfUtils.getString(conf, BUCKET);
        // fail fast if the bucket has not been configured or created beforehand
        boolean bucketExists = client.doesBucketExist(bucketName);
        if (!bucketExists) {
            String message = "Bucket " + bucketName + " does not exist";
            throw new RuntimeException(message);
        }
        // reports the counts of cache events every 10 seconds
        this.eventCounter = context.registerMetric(getMetricPrefix()
                + "s3cache_counter", new MultiCountMetric(), 10);
    }
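
    // For illustration: assuming the BUCKET constant inherited from
    // AbstractS3CacheBolt maps to a configuration key named
    // "s3.cache.bucket" (a hypothetical name), the topology configuration
    // would contain an entry such as:
    //
    //   s3.cache.bucket: "my-crawl-cache"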

    @Override
    public void execute(Tuple tuple) {
        // stores the binary content on S3
        byte[] content = tuple.getBinaryByField("content");
        String url = tuple.getStringByField("url");
        final Metadata metadata = (Metadata) tuple.getValueByField("metadata");
        // lets the subclass decide what, if anything, should be cached
        byte[] contentToCache = getContentToCache(metadata, content, url);
        if (contentToCache == null) {
            LOG.info("{} had no data to cache", url);
            _collector.emit(tuple, new Values(url, content, metadata));
            // ack it no matter what
            _collector.ack(tuple);
            return;
        }

        // already in the cache - no need to store it again
        if (!shouldOverwrite(metadata)) {
            eventCounter.scope("already_in_cache").incr();
            _collector.emit(tuple, new Values(url, content, metadata));
            // ack it no matter what
            _collector.ack(tuple);
            return;
        }

        // builds the S3 key by URL-encoding the URL
        String key = "";
        try {
            key = URLEncoder.encode(url, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            // ignore it - UTF-8 is always supported
        }
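        // e.g. "http://example.com/page?q=1" becomes the key
        // "http%3A%2F%2Fexample.com%2Fpage%3Fq%3D1"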

        // S3 limits object keys to 1024 bytes; the encoded key is ASCII so
        // its character count matches its byte count. The prefix counts
        // towards the limit as well.
        if (getKeyPrefix().length() + key.length() >= 1024) {
            LOG.info("Key too large : {}", key);
            eventCounter.scope("key_too_large").incr();
            _collector.emit(tuple, new Values(url, content, metadata));
            // ack it no matter what
            _collector.ack(tuple);
            return;
        }

        ByteArrayInputStream input = new ByteArrayInputStream(contentToCache);
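
        // the object metadata below sets the content length and requests the
        // STANDARD_IA storage class via a raw header; STANDARD_IA trades
        // cheaper storage for a retrieval fee, which suits write-once,
        // rarely-read cached content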
        ObjectMetadata md = new ObjectMetadata();
        md.setContentLength(contentToCache.length);
        md.setHeader("x-amz-storage-class", "STANDARD_IA");
        try {
            PutObjectResult result = client.putObject(bucketName,
                    getKeyPrefix() + key, input, md);
            eventCounter.scope("cached").incr();
            // TODO check something with the result?
        } catch (AmazonS3Exception exception) {
            LOG.error("AmazonS3Exception while storing {}", url, exception);
            eventCounter.scope("s3_exception").incr();
        } finally {
            try {
                input.close();
            } catch (IOException e) {
                LOG.error("Error while closing ByteArrayInputStream", e);
            }
        }

        _collector.emit(tuple, new Values(url, content, metadata));
        // ack it no matter what
        _collector.ack(tuple);
    }
}
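
// What follows is a minimal sketch of a concrete subclass, kept here for
// illustration only. The class name, key prefix, metric prefix and the
// "cached" metadata key are assumptions, not part of StormCrawler.
@SuppressWarnings("serial")
class ExampleContentCacher extends S3Cacher {

    @Override
    protected byte[] getContentToCache(Metadata metadata, byte[] content,
            String url) {
        // cache the fetched bytes as-is; returning null skips empty documents
        if (content == null || content.length == 0) {
            return null;
        }
        return content;
    }

    @Override
    protected String getKeyPrefix() {
        return "content/";
    }

    @Override
    protected String getMetricPrefix() {
        return "example_";
    }

    @Override
    protected boolean shouldOverwrite(Metadata metadata) {
        // store only if a previous pass has not already cached this URL
        return metadata.getFirstValue("cached") == null;
    }
}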