/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.aws.s3; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Map; import org.apache.commons.io.IOUtils; import org.apache.storm.metric.api.MultiCountMetric; import org.apache.storm.task.OutputCollector; import org.apache.storm.task.TopologyContext; import org.apache.storm.topology.OutputFieldsDeclarer; import org.apache.storm.tuple.Fields; import org.apache.storm.tuple.Tuple; import org.apache.storm.tuple.Values; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.amazonaws.services.s3.model.AmazonS3Exception; import com.amazonaws.services.s3.model.S3Object; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.util.ConfUtils; /** * Checks whether a URL can be found in the cache, if not delegate to the * following bolt e.g. Fetcher, which gets bypassed otherwise. Does not enforce * any politeness. The credentials must be stored in ~/.aws/credentials **/ @SuppressWarnings("serial") public class S3CacheChecker extends AbstractS3CacheBolt { public static final Logger LOG = LoggerFactory .getLogger(S3CacheChecker.class); public static final String CACHE_STREAM = "cached"; @Override public void prepare(Map conf, TopologyContext context, OutputCollector collector) { super.prepare(conf, context, collector); bucketName = ConfUtils.getString(conf, BUCKET); boolean bucketExists = client.doesBucketExist(bucketName); if (!bucketExists) { String message = "Bucket " + bucketName + " does not exist"; throw new RuntimeException(message); } this.eventCounter = context.registerMetric("s3cache_counter", new MultiCountMetric(), 10); } @Override public void execute(Tuple tuple) { String url = tuple.getStringByField("url"); Metadata metadata = (Metadata) tuple.getValueByField("metadata"); // normalises URL String key = ""; try { key = URLEncoder.encode(url, "UTF-8"); } catch (UnsupportedEncodingException e) { // ignore it - we know UTF-8 is valid } // check size of the key if (key.length() >= 1024) { LOG.info("Key too large : {}", key); eventCounter.scope("result_keytoobig").incrBy(1); _collector.emit(tuple, new Values(url, metadata)); // ack it no matter what _collector.ack(tuple); return; } long preCacheQueryTime = System.currentTimeMillis(); S3Object obj = null; try { obj = client.getObject(bucketName, key); } catch (AmazonS3Exception e) { eventCounter.scope("result_misses").incrBy(1); // key does not exist? // no need for logging } long postCacheQueryTime = System.currentTimeMillis(); LOG.debug("Queried S3 cache in {} msec", (postCacheQueryTime - preCacheQueryTime)); if (obj != null) { try { byte[] content = IOUtils.toByteArray(obj.getObjectContent()); eventCounter.scope("result_hits").incrBy(1); eventCounter.scope("bytes_fetched").incrBy(content.length); metadata.setValue(INCACHE, "true"); _collector.emit(CACHE_STREAM, tuple, new Values(url, content, metadata)); _collector.ack(tuple); return; } catch (Exception e) { eventCounter.scope("result.exception").incrBy(1); LOG.error("IOException when extracting byte array", e); } } _collector.emit(tuple, new Values(url, metadata)); _collector.ack(tuple); } @Override public void declareOutputFields(OutputFieldsDeclarer declarer) { declarer.declare(new Fields("url", "metadata")); declarer.declareStream(CACHE_STREAM, new Fields("url", "content", "metadata")); } }