/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.streams.rss.provider;
import org.apache.streams.core.StreamsDatum;
import org.apache.streams.data.util.RFC3339Utils;
import org.apache.streams.rss.serializer.SyndEntrySerializer;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.google.common.annotations.VisibleForTesting;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
/**
* A {@link java.lang.Runnable} task that queues rss feed data.
*
* <p/>
* <code>RssStreamProviderTask</code> reads the content of an rss feed and queues the articles from
* the feed in the form of a {@link com.fasterxml.jackson.databind.node.ObjectNode} wrapped in a {@link org.apache.streams.core.StreamsDatum}.
* The task can filter articles by published date. If the task cannot parse an article's date, or the article does not contain a
* published date, the task will attempt to queue the article by default.
*
* <p/>
* A task can be run in perpetual mode, which stores the seen article ids in a static variable. The next time a
* <code>RssStreamProviderTask</code> is run, it will not queue data that was seen the previous time the rss feed was read.
* This is an attempt to keep a {@link org.apache.streams.rss.provider.RssStreamProvider} from outputting
* multiple copies of the same article.
*
* <p/>
* ** Warning! **
* It is still possible to output multiple copies of the same article. If executions of tasks for the same rss feed overlap
* in time, it is possible that the static variable of previously seen articles will not have been updated in time.
*
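* <p/>
* A minimal usage sketch (the executor and feed url here are illustrative, not part of this class):
* <pre>{@code
* BlockingQueue<StreamsDatum> queue = new LinkedBlockingQueue<>();
* RssStreamProviderTask task = new RssStreamProviderTask(queue, "http://example.com/feed.rss");
* Executors.newSingleThreadExecutor().submit(task);
* StreamsDatum datum = queue.poll(30, TimeUnit.SECONDS); // null if nothing was queued in time
* }</pre>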
*/
public class RssStreamProviderTask implements Runnable {
private static final Logger LOGGER = LoggerFactory.getLogger(RssStreamProviderTask.class);
private static final int DEFAULT_TIME_OUT = 10000; // 10 seconds
private static final String RSS_KEY = "rssFeed";
private static final String URI_KEY = "uri";
private static final String LINK_KEY = "link";
private static final String DATE_KEY = "publishedDate";
/**
* Map that contains the set of previously seen article ids for each rss feed. Shared statically across
* task instances so that perpetual-mode tasks can skip articles queued by a previous execution.
*/
@VisibleForTesting
protected static final Map<String, Set<String>> PREVIOUSLY_SEEN = new ConcurrentHashMap<>();
private BlockingQueue<StreamsDatum> dataQueue;
private String rssFeed;
private int timeOut;
private SyndEntrySerializer serializer;
private DateTime publishedSince;
private boolean perpetual;
/**
* Creates a non-perpetual task with effectively no date filter (articles published within the last
* 30 years pass) and the default time out of 10 seconds.
* @see RssStreamProviderTask#RssStreamProviderTask(java.util.concurrent.BlockingQueue, String, org.joda.time.DateTime, int, boolean)
* @param queue queue to push data to
* @param rssFeed url of the rss feed to read
*/
public RssStreamProviderTask(BlockingQueue<StreamsDatum> queue, String rssFeed) {
this(queue, rssFeed, new DateTime().minusYears(30), DEFAULT_TIME_OUT, false);
}
/**
* Creates a non-perpetual task with effectively no date filter (articles published within the last
* 30 years pass).
* @see RssStreamProviderTask#RssStreamProviderTask(java.util.concurrent.BlockingQueue, String, org.joda.time.DateTime, int, boolean)
* @param queue queue to push data to
* @param rssFeed url of the rss feed to read
* @param timeOut url connection timeout in milliseconds
*/
public RssStreamProviderTask(BlockingQueue<StreamsDatum> queue, String rssFeed, int timeOut) {
this(queue, rssFeed, new DateTime().minusYears(30), timeOut, false);
}
/**
* Creates a non-perpetual task with the default time out of 10 seconds.
* @see RssStreamProviderTask#RssStreamProviderTask(java.util.concurrent.BlockingQueue, String, org.joda.time.DateTime, int, boolean)
* @param queue queue to push data to
* @param rssFeed url of the rss feed to read
* @param publishedSince DateTime to filter articles by; articles published after this time are queued
*/
public RssStreamProviderTask(BlockingQueue<StreamsDatum> queue, String rssFeed, DateTime publishedSince) {
this(queue, rssFeed, publishedSince, DEFAULT_TIME_OUT, false);
}
/**
* Creates an RssStreamProviderTask that reads an rss feed url and queues the resulting articles as
* StreamsDatums whose documents are object nodes.
* @param queue queue to push data to
* @param rssFeed url of the rss feed to read
* @param publishedSince DateTime to filter articles by; articles published after this time are queued
* @param timeOut url connection timeout in milliseconds
* @param perpetual true to run in perpetual mode (NOT RECOMMENDED; see the class warning about duplicates)
*/
*/
public RssStreamProviderTask(BlockingQueue<StreamsDatum> queue, String rssFeed, DateTime publishedSince, int timeOut, boolean perpetual) {
this.dataQueue = queue;
this.rssFeed = rssFeed;
this.timeOut = timeOut;
this.publishedSince = publishedSince;
this.serializer = new SyndEntrySerializer();
this.perpetual = perpetual;
}
/**
* The rss feed url that this task is responsible for reading.
* @return rss feed url
*/
public String getRssFeed() {
return this.rssFeed;
}
@Override
public void run() {
try {
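// read the feed once; the returned set contains the id of every entry seen in this pass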
Set<String> batch = queueFeedEntries(new URL(this.rssFeed));
if (this.perpetual) {
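// replace the stored batch so the next execution for this feed can skip these entries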
PREVIOUSLY_SEEN.put(this.getRssFeed(), batch);
}
} catch (IOException | FeedException ex) {
LOGGER.warn("Exception while reading rss stream, {} : {}", this.rssFeed, ex);
}
}
/**
* Reads the feed url and queues the data.
* @param feedUrl rss feed url
* @return set of the ids (uri or link) of all articles that were read from the feed
* @throws IOException when it cannot connect to the url or the url is malformed
* @throws FeedException when it cannot parse the feed.
*/
@VisibleForTesting
protected Set<String> queueFeedEntries(URL feedUrl) throws IOException, FeedException {
// ConcurrentHashSet is preferable, but it's only in guava 15+
// spark 1.5.0 uses guava 14 so for the moment this is the workaround
Set<String> batch = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
URLConnection connection = feedUrl.openConnection();
connection.setConnectTimeout(this.timeOut);
connection.setReadTimeout(this.timeOut); // guard both connecting and reading against a hanging feed
SyndFeedInput input = new SyndFeedInput();
SyndFeed feed = input.build(new InputStreamReader(connection.getInputStream()));
for (Object entryObj : feed.getEntries()) {
SyndEntry entry = (SyndEntry) entryObj;
ObjectNode nodeEntry = this.serializer.deserialize(entry);
nodeEntry.put(RSS_KEY, this.rssFeed);
String entryId = determineId(nodeEntry);
batch.add(entryId);
StreamsDatum datum = new StreamsDatum(nodeEntry);
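// Queueing policy: queue when the published date is after publishedSince; if the date is
// missing or unparseable, queue by default. In perpetual mode, also skip entries whose ids
// were recorded by the previous read of this feed.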
try {
JsonNode published = nodeEntry.get(DATE_KEY);
if (published != null) {
try {
DateTime date = RFC3339Utils.parseToUTC(published.asText());
if (date.isAfter(this.publishedSince) && (!this.perpetual || !seenBefore(entryId, this.rssFeed))) {
this.dataQueue.put(datum);
LOGGER.debug("Added entry, {}, to provider queue.", entryId);
}
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
} catch (Exception ex) {
LOGGER.trace("Failed to parse date from object node, attempting to add node to queue by default.");
if (!this.perpetual || !seenBefore(entryId, this.rssFeed)) {
this.dataQueue.put(datum);
LOGGER.debug("Added entry, {}, to provider queue.", entryId);
}
}
} else {
LOGGER.debug("No published date present, attempting to add node to queue by default.");
if (!this.perpetual || !seenBefore(entryId, this.rssFeed)) {
this.dataQueue.put(datum);
LOGGER.debug("Added entry, {}, to provider queue.", entryId);
}
}
} catch (InterruptedException ie) {
LOGGER.error("Interupted Exception.");
Thread.currentThread().interrupt();
}
}
return batch;
}
/**
* Returns the article's uri, or its link if the uri is absent or empty, to use as the id.
* @param node node to extract the id from
* @return the id, or null if neither a uri nor a link is present
*/
private String determineId(ObjectNode node) {
String id = null;
if (node.get(URI_KEY) != null && !node.get(URI_KEY).textValue().equals("")) {
id = node.get(URI_KEY).textValue();
} else if (node.get(LINK_KEY) != null && !node.get(LINK_KEY).textValue().equals("")) {
id = node.get(LINK_KEY).textValue();
}
return id;
}
/**
* Returns true if the article was previously seen in an earlier task execution for this feed.
* @param id article id
* @param rssFeed rss feed url
* @return true if the id was seen before, false otherwise
*/
private boolean seenBefore(String id, String rssFeed) {
Set<String> previousBatch = PREVIOUSLY_SEEN.get(rssFeed);
if (previousBatch == null) {
return false;
}
return previousBatch.contains(id);
}
}