/**
 * Licensed to DigitalPebble Ltd under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership. DigitalPebble licenses this file
 * to You under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 */

package com.digitalpebble.stormcrawler.solr.persistence;

import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Queue;

import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.solr.SolrConnection;
import com.digitalpebble.stormcrawler.util.ConfUtils;
import com.digitalpebble.stormcrawler.util.URLPartitioner;

public class SolrSpout extends BaseRichSpout {

    private static final Logger LOG = LoggerFactory.getLogger(SolrSpout.class);

    private static final String BOLT_TYPE = "status";

    private static final String SolrMaxInflightParam = "solr.status.max.inflight.urls.per.bucket";
    private static final String SolrDiversityFieldParam = "solr.status.bucket.field";
    private static final String SolrDiversityBucketParam = "solr.status.bucket.maxsize";
    private static final String SolrMetadataPrefix = "solr.status.metadata.prefix";

    private SpoutOutputCollector _collector;

    private SolrConnection connection;

    private final int bufferSize = 100;

    private Queue<Values> buffer = new LinkedList<>();

    private int lastStartOffset = 0;

    private URLPartitioner partitioner;

    private int maxInFlightURLsPerBucket = -1;

    private String diversityField = null;

    private int diversityBucketSize = 0;

    private String mdPrefix;

    /** Keeps a count of the URLs being processed per host/domain/IP **/
    private Map<String, Integer> inFlightTracker = new HashMap<>();

    // URL / politeness bucket (hostname / domain etc...)
    private Map<String, String> beingProcessed = new HashMap<>();
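    /*
     * A minimal configuration sketch for the keys read in open() below. The
     * values shown are the defaults applied by this spout, except for
     * solr.status.bucket.field, which has no default ("host" is purely
     * illustrative):
     *
     *   solr.status.max.inflight.urls.per.bucket: 1
     *   solr.status.bucket.field: "host"
     *   solr.status.bucket.maxsize: 100
     *   solr.status.metadata.prefix: "metadata"
     */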
    @Override
    public void open(Map stormConf, TopologyContext context,
            SpoutOutputCollector collector) {
        // This implementation works only where there is a single instance of
        // the spout. Having more than one instance means that they would run
        // the same queries and send the same tuples down the topology.
        int totalTasks = context
                .getComponentTasks(context.getThisComponentId()).size();
        if (totalTasks > 1) {
            throw new RuntimeException(
                    "Can't have more than one instance of SolrSpout");
        }

        maxInFlightURLsPerBucket = ConfUtils.getInt(stormConf,
                SolrMaxInflightParam, 1);
        diversityField = ConfUtils.getString(stormConf,
                SolrDiversityFieldParam);
        diversityBucketSize = ConfUtils.getInt(stormConf,
                SolrDiversityBucketParam, 100);
        mdPrefix = ConfUtils.getString(stormConf, SolrMetadataPrefix,
                "metadata");

        try {
            connection = SolrConnection.getConnection(stormConf, BOLT_TYPE);
        } catch (Exception e) {
            LOG.error("Can't connect to Solr", e);
            throw new RuntimeException(e);
        }

        partitioner = new URLPartitioner();
        partitioner.configure(stormConf);

        _collector = collector;
    }

    @Override
    public void close() {
        if (connection != null) {
            try {
                connection.close();
            } catch (Exception e) {
                LOG.error("Can't close connection to Solr", e);
            }
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("url", "metadata"));
    }

    @Override
    public void nextTuple() {
        // anything in the buffer?
        if (!buffer.isEmpty()) {
            Values fields = buffer.remove();

            String url = fields.get(0).toString();
            Metadata metadata = (Metadata) fields.get(1);

            String partitionKey = partitioner.getPartition(url, metadata);

            // check whether we already have too many tuples in flight for
            // this partition key
            if (maxInFlightURLsPerBucket != -1) {
                Integer inFlightForThisKey = inFlightTracker.get(partitionKey);
                if (inFlightForThisKey == null)
                    inFlightForThisKey = Integer.valueOf(0);
                if (inFlightForThisKey.intValue() >= maxInFlightURLsPerBucket) {
                    // do it later! Leave the URL out of the queue for now; it
                    // will be picked up again by a later Solr query
                    return;
                }
                int currentCount = inFlightForThisKey.intValue();
                inFlightTracker.put(partitionKey, ++currentCount);
            }

            beingProcessed.put(url, partitionKey);

            // the URL doubles as the message ID passed back to ack() / fail()
            this._collector.emit(fields, url);
            return;
        }

        // re-populate the buffer
        populateBuffer();
    }

    private void populateBuffer() {
        // TODO Same as the ElasticSearchSpout?
        // TODO Use the cursor feature?
        // https://cwiki.apache.org/confluence/display/solr/Pagination+of+Results
        SolrQuery query = new SolrQuery();

        query.setQuery("*:*").addFilterQuery("nextFetchDate:[* TO NOW]")
                .setStart(lastStartOffset).setRows(this.bufferSize);

        if (StringUtils.isNotBlank(diversityField)) {
            query.addFilterQuery(String.format("{!collapse field=%s}",
                    diversityField));
            query.set("expand", "true").set("expand.rows",
                    diversityBucketSize);
        }

        try {
            QueryResponse response = connection.getClient().query(query);

            SolrDocumentList docs = new SolrDocumentList();

            if (StringUtils.isNotBlank(diversityField)) {
                // Add the main documents collapsed by the CollapsingQParser
                // plugin
                docs.addAll(response.getResults());

                Map<String, SolrDocumentList> expandedResults = response
                        .getExpandedResults();
                for (String key : expandedResults.keySet()) {
                    docs.addAll(expandedResults.get(key));
                }
            } else {
                docs = response.getResults();
            }

            int numhits = response.getResults().size();

            // no more results? start again from the beginning
            if (numhits == 0)
                lastStartOffset = 0;
            else
                lastStartOffset += numhits;

            String prefix = mdPrefix.concat(".");

            for (SolrDocument doc : docs) {
                String url = (String) doc.get("url");
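                // a status document is expected to hold at least a "url"
                // field (read above) plus any number of fields starting with
                // the metadata prefix, which are copied below into the
                // tuple's Metadata with the prefix stripped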
                // is already being processed - skip it!
                if (beingProcessed.containsKey(url))
                    continue;

                Metadata metadata = new Metadata();

                Iterator<String> keyIterators = doc.getFieldNames().iterator();
                while (keyIterators.hasNext()) {
                    String key = keyIterators.next();
                    if (key.startsWith(prefix)) {
                        Collection<Object> values = doc.getFieldValues(key);
                        key = StringUtils.replace(key, prefix, "", 1);
                        Iterator<Object> valueIterator = values.iterator();
                        while (valueIterator.hasNext()) {
                            String value = (String) valueIterator.next();
                            metadata.addValue(key, value);
                        }
                    }
                }

                buffer.add(new Values(url, metadata));
            }
        } catch (Exception e) {
            LOG.error("Can't query Solr", e);
        }
    }

    @Override
    public void ack(Object msgId) {
        super.ack(msgId);
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

    @Override
    public void fail(Object msgId) {
        super.fail(msgId);
        String partitionKey = beingProcessed.remove(msgId);
        decrementPartitionKey(partitionKey);
    }

    private void decrementPartitionKey(String partitionKey) {
        if (partitionKey == null)
            return;
        Integer currentValue = this.inFlightTracker.get(partitionKey);
        if (currentValue == null)
            return;
        int currentVal = currentValue.intValue();
        currentVal--;
        this.inFlightTracker.put(partitionKey, currentVal);
    }
}
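/*
 * A minimal usage sketch: wiring the spout into a topology. The component
 * names are arbitrary and FetcherBolt stands for whichever bolt consumes the
 * (url, metadata) tuples in your topology; conf is your Storm Config. Note
 * that open() enforces a parallelism of 1 for this spout.
 *
 *   TopologyBuilder builder = new TopologyBuilder();
 *   builder.setSpout("spout", new SolrSpout(), 1);
 *   builder.setBolt("fetch", new FetcherBolt(), 4).shuffleGrouping("spout");
 *   StormSubmitter.submitTopology("crawl", conf, builder.createTopology());
 */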