/**
* Licensed to DigitalPebble Ltd under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* DigitalPebble licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.digitalpebble.storm.crawler.elasticsearch.persistence;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;
import com.digitalpebble.storm.crawler.Metadata;
import com.digitalpebble.storm.crawler.elasticsearch.ElasticSearchConnection;
import com.digitalpebble.storm.crawler.util.ConfUtils;
import com.digitalpebble.storm.crawler.util.URLPartitioner;
/**
 * Overly simplistic spout implementation which pulls URLs from an ES index.
 * Doesn't do anything about data locality or sharding.
 **/
public class ElasticSearchSpout extends BaseRichSpout {

    private static final Logger LOG = LoggerFactory
            .getLogger(ElasticSearchSpout.class);

    private static final String ESBoltType = "status";

    private static final String ESStatusIndexNameParamName = "es.status.index.name";
    private static final String ESStatusDocTypeParamName = "es.status.doc.type";
    private static final String ESStatusMaxInflightParamName = "es.status.max.inflight.urls.per.bucket";

    private String indexName;
    private String docType;

    private SpoutOutputCollector _collector;

    private Client client;

    /** Max number of hits fetched from ES per query **/
    private final int bufferSize = 100;

    /** URLs fetched from ES and waiting to be emitted **/
    private Queue<Values> buffer = new LinkedList<Values>();

    /** From-offset of the next ES query; reset to 0 when a query returns no hits **/
    private int lastStartOffset = 0;

    private URLPartitioner partitioner;

    /** Max URLs in flight per politeness bucket; -1 disables the limit **/
    private int maxInFlightURLsPerBucket = -1;

    /** Keeps a count of the URLs being processed per host/domain/IP **/
    private Map<String, Integer> inFlightTracker = new HashMap<String, Integer>();

    // URL / politeness bucket (hostname / domain etc...)
    private Map<String, String> beingProcessed = new HashMap<String, String>();

    /**
     * Reads the index name, doc type and in-flight limit from the
     * configuration, opens the ES client and configures the partitioner.
     *
     * @throws RuntimeException
     *             if the connection to ElasticSearch cannot be established
     */
    @Override
    public void open(Map stormConf, TopologyContext context,
            SpoutOutputCollector collector) {
        indexName = ConfUtils.getString(stormConf, ESStatusIndexNameParamName,
                "status");
        docType = ConfUtils.getString(stormConf, ESStatusDocTypeParamName,
                "status");
        maxInFlightURLsPerBucket = ConfUtils.getInt(stormConf,
                ESStatusMaxInflightParamName, 1);

        try {
            client = ElasticSearchConnection.getClient(stormConf, ESBoltType);
        } catch (Exception e1) {
            LOG.error("Can't connect to ElasticSearch", e1);
            throw new RuntimeException(e1);
        }

        partitioner = new URLPartitioner();
        partitioner.configure(stormConf);

        _collector = collector;
    }

    @Override
    public void close() {
        if (client != null)
            client.close();
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "metadata"));
    }

    /**
     * Emits the next buffered URL, unless its politeness bucket already has
     * the max number of URLs in flight, in which case the tuple is put back at
     * the end of the buffer and retried later. Queries ES when the buffer is
     * empty.
     **/
    @Override
    public void nextTuple() {
        // have anything in the buffer?
        if (!buffer.isEmpty()) {
            Values fields = buffer.remove();

            String url = fields.get(0).toString();
            Metadata metadata = (Metadata) fields.get(1);

            String partitionKey = partitioner.getPartition(url, metadata);

            // check whether we already have too many tuples in flight for this
            // partition key
            if (maxInFlightURLsPerBucket != -1) {
                Integer inflightforthiskey = inFlightTracker.get(partitionKey);
                if (inflightforthiskey == null)
                    inflightforthiskey = Integer.valueOf(0);

                if (inflightforthiskey.intValue() >= maxInFlightURLsPerBucket) {
                    // do it later! put it back at the end of the queue so that
                    // the tuple is not lost
                    buffer.add(fields);
                    return;
                }

                inFlightTracker.put(partitionKey,
                        Integer.valueOf(inflightforthiskey.intValue() + 1));
            }

            beingProcessed.put(url, partitionKey);

            this._collector.emit(fields, url);
            return;
        }
        // re-populate the buffer
        populateBuffer();
    }

    /** run a query on ES to populate the internal buffer **/
    private void populateBuffer() {
        // TODO cap the number of results per shard
        // assuming that the sharding of status URLs is done
        // based on the hostname domain or anything else
        // which is useful for politeness

        // TODO cap the results per host or domain

        Date now = new Date();

        // TODO use scrolls instead?
        // @see
        // http://www.elasticsearch.org/guide/en/elasticsearch/client/java-api/current/search.html#scrolling
        SearchResponse response = client
                .prepareSearch(indexName)
                .setTypes(docType)
                .setSearchType(SearchType.DFS_QUERY_THEN_FETCH)
                .setQuery(QueryBuilders.rangeQuery("nextFetchDate").lte(now))
                // .setPostFilter(
                // FilterBuilders.rangeFilter("age").from(12).to(18))
                .setFrom(lastStartOffset).setSize(this.bufferSize)
                .setExplain(false).execute().actionGet();

        SearchHits hits = response.getHits();
        int numhits = hits.getHits().length;

        // no more results? start again from the beginning of the index
        if (numhits == 0)
            lastStartOffset = 0;
        else
            lastStartOffset += numhits;

        // filter results so that we don't include URLs which
        // are already being processed
        for (int i = 0; i < hits.getHits().length; i++) {
            Map<String, Object> keyValues = hits.getHits()[i].sourceAsMap();
            String url = (String) keyValues.get("url");

            // missing URL field - skip the document
            if (url == null) {
                LOG.info("Found doc without url field - skipping");
                continue;
            }

            // is already being processed - skip it!
            if (beingProcessed.containsKey(url))
                continue;

            String mdAsString = (String) keyValues.get("metadata");
            Metadata metadata = new Metadata();
            if (mdAsString != null) {
                // parse the string and generate the MD accordingly
                // url.path: http://www.lemonde.fr/
                // depth: 1
                String[] kvs = mdAsString.split("\n");
                for (String pair : kvs) {
                    // limit the split to 2 so that values containing
                    // ": " are not truncated
                    String[] kv = pair.split(": ", 2);
                    if (kv.length != 2) {
                        LOG.info("Invalid key value pair {}", pair);
                        continue;
                    }
                    metadata.addValue(kv[0], kv[1]);
                }
            }

            buffer.add(new Values(url, metadata));
        }
    }

    /** Removes the acked URL from the in-flight bookkeeping **/
    @Override
    public void ack(Object msgId) {
        super.ack(msgId);
        // the message id is the URL itself
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

    /** Removes the failed URL from the in-flight bookkeeping **/
    @Override
    public void fail(Object msgId) {
        super.fail(msgId);
        // the message id is the URL itself
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

    /**
     * Decrements the in-flight counter for a politeness bucket; removes the
     * entry entirely when the count drops to zero so that the tracker does not
     * grow without bound.
     **/
    private void decrementPartitionKey(String partitionKey) {
        if (partitionKey == null)
            return;
        Integer currentValue = this.inFlightTracker.get(partitionKey);
        if (currentValue == null)
            return;
        int newValue = currentValue.intValue() - 1;
        if (newValue <= 0) {
            this.inFlightTracker.remove(partitionKey);
        } else {
            this.inFlightTracker.put(partitionKey, Integer.valueOf(newValue));
        }
    }
}