/**
 * Licensed to DigitalPebble Ltd under one or more contributor license
 * agreements. See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership. DigitalPebble licenses this file
 * to You under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy
 * of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.spout;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

import org.apache.storm.metric.api.IMetric;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.util.StringTabScheme;

/**
 * Stores URLs in memory. Useful for testing and debugging in local mode or
 * with a single worker.
 */
@SuppressWarnings("serial")
public class MemorySpout extends BaseRichSpout {

    private static final Logger LOG = LoggerFactory
            .getLogger(MemorySpout.class);

    private SpoutOutputCollector _collector;

    private StringTabScheme scheme = new StringTabScheme();

    private boolean active = true;

    private static PriorityQueue<ScheduledURL> queue = new PriorityQueue<>();

    private String[] startingURLs;
    public MemorySpout(String... urls) {
        startingURLs = urls;
    }

    /**
     * Adds a new URL to the in-memory queue.
     *
     * @param url
     *            URL to schedule
     * @param md
     *            metadata associated with the URL
     * @param nextFetch
     *            date from which the URL becomes eligible for fetching
     **/
    public static void add(String url, Metadata md, Date nextFetch) {
        LOG.debug("Adding {} with md {} and nextFetch {}", url, md, nextFetch);
        ScheduledURL tuple = new ScheduledURL(url, md, nextFetch);
        synchronized (queue) {
            queue.add(tuple);
        }
    }

    @Override
    public void open(@SuppressWarnings("rawtypes") Map conf,
            TopologyContext context, SpoutOutputCollector collector) {
        _collector = collector;

        // check that there is only one instance of it
        int totalTasks = context
                .getComponentTasks(context.getThisComponentId()).size();
        if (totalTasks > 1) {
            throw new RuntimeException(
                    "Can't have more than one instance of the MemorySpout");
        }

        Date now = new Date();
        for (String u : startingURLs) {
            LOG.debug("About to deserialize {} ", u);
            List<Object> tuple = scheme.deserialize(ByteBuffer.wrap(u
                    .getBytes(StandardCharsets.UTF_8)));
            add((String) tuple.get(0), (Metadata) tuple.get(1), now);
        }

        context.registerMetric("queue_size", new IMetric() {
            @Override
            public Object getValueAndReset() {
                return queue.size();
            }
        }, 10);
    }

    @Override
    public void nextTuple() {
        if (!active)
            return;

        synchronized (queue) {
            // removes the URL
            ScheduledURL tuple = queue.poll();
            if (tuple == null)
                return;

            // check whether it is due for fetching
            if (tuple.nextFetchDate.after(new Date())) {
                LOG.debug("Tuple {} not ready for fetching", tuple.URL);
                // put it back and wait
                queue.add(tuple);
                return;
            }

            List<Object> tobs = new LinkedList<>();
            tobs.add(tuple.URL);
            tobs.add(tuple.m);
            _collector.emit(tobs, tuple.URL);
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(scheme.getOutputFields());
    }

    @Override
    public void activate() {
        super.activate();
        active = true;
    }

    @Override
    public void deactivate() {
        super.deactivate();
        active = false;
    }
}

class ScheduledURL implements Comparable<ScheduledURL> {
    Date nextFetchDate;
    String URL;
    Metadata m;

    ScheduledURL(String URL, Metadata m, Date nextFetchDate) {
        this.nextFetchDate = nextFetchDate;
        this.URL = URL;
        this.m = m;
    }

    /** Sorts by next fetch date, then by URL. **/
    @Override
    public int compareTo(ScheduledURL o) {
        // entries with identical URLs compare as equal, regardless of date
        int compString = URL.compareTo(o.URL);
        if (compString == 0)
            return 0;
        // otherwise order by next fetch date
        int comp = nextFetchDate.compareTo(o.nextFetchDate);
        if (comp != 0)
            return comp;
        // break ties on the URL
        return compString;
    }
}
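// Usage sketch (illustrative only, not part of this file): the spout is seeded
// with strings parsed by StringTabScheme (a bare URL, optionally followed by
// tab-separated key=value metadata) and must run as a single task, e.g. in
// local mode. The wiring below is an assumption for a minimal local test; the
// bolt chain is omitted. Other components may call MemorySpout.add(...) to
// re-inject newly discovered URLs into the shared queue.
//
//   TopologyBuilder builder = new TopologyBuilder();
//   builder.setSpout("spout", new MemorySpout("http://example.com/"), 1);
//   // ... declare fetch/parse/status bolts here ...
//   LocalCluster cluster = new LocalCluster();
//   cluster.submitTopology("memoryTest", new Config(), builder.createTopology());
//   // later, e.g. from a status-updating bolt:
//   MemorySpout.add("http://example.com/page", new Metadata(), new Date());
//   // ...
//   cluster.shutdown();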