/* Copyright 2012 Tim Garrett, Mothsoft LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.mothsoft.alexis.engine.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.springframework.transaction.PlatformTransactionManager;
import org.springframework.transaction.TransactionStatus;
import org.springframework.transaction.support.TransactionCallback;
import org.springframework.transaction.support.TransactionCallbackWithoutResult;
import org.springframework.transaction.support.TransactionTemplate;
import com.mothsoft.alexis.dao.DocumentDao;
import com.mothsoft.alexis.dao.RssFeedDao;
import com.mothsoft.alexis.dao.SourceDao;
import com.mothsoft.alexis.dao.UserDao;
import com.mothsoft.alexis.domain.Document;
import com.mothsoft.alexis.domain.DocumentType;
import com.mothsoft.alexis.domain.DocumentUser;
import com.mothsoft.alexis.domain.RssFeed;
import com.mothsoft.alexis.domain.RssSource;
import com.mothsoft.alexis.domain.User;
import com.mothsoft.alexis.engine.textual.WebContentParser;
import com.mothsoft.alexis.security.CurrentUserUtil;
import com.mothsoft.alexis.util.HttpClientResponse;
import com.mothsoft.alexis.util.NetworkingUtil;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
public class RssRetrievalTaskImpl implements RetrievalTask {
private static final Logger logger = Logger.getLogger(RssRetrievalTaskImpl.class);
private static final Comparator<SyndEntry> ENTRY_COMPARATOR;
static {
ENTRY_COMPARATOR = new Comparator<SyndEntry>() {
public int compare(SyndEntry e1, SyndEntry e2) {
// arbitrarily sort if no dates are available
final Date date1 = mostRecentOfOrNow(e1.getPublishedDate(), e1.getUpdatedDate());
final Date date2 = mostRecentOfOrNow(e2.getPublishedDate(), e2.getUpdatedDate());
return date1.compareTo(date2);
}
private Date mostRecentOfOrNow(Date... dates) {
Date newest = null;
for (final Date date : dates) {
if (date == null) {
continue;
}
if (newest == null || date.after(newest)) {
newest = date;
}
}
if (newest == null) {
newest = new Date();
}
return newest;
}
};
}
private DocumentDao documentDao;
private RssFeedDao rssFeedDao;
private SourceDao sourceDao;
private UserDao userDao;
private PlatformTransactionManager transactionManager;
private TransactionTemplate transactionTemplate;
private WebContentParser webContentParser;
private IntelligentDelay delay;
public RssRetrievalTaskImpl() {
delay = new IntelligentDelay("RSS Retrieval", 5, 90);
}
public void setDocumentDao(final DocumentDao documentDao) {
this.documentDao = documentDao;
}
public void setRssFeedDao(final RssFeedDao rssFeedDao) {
this.rssFeedDao = rssFeedDao;
}
public void setSourceDao(final SourceDao sourceDao) {
this.sourceDao = sourceDao;
}
public void setUserDao(final UserDao userDao) {
this.userDao = userDao;
}
public void setTransactionManager(final PlatformTransactionManager transactionManager) {
this.transactionManager = transactionManager;
this.transactionTemplate = new TransactionTemplate(this.transactionManager);
}
public void setWebContentParser(final WebContentParser webContentParser) {
this.webContentParser = webContentParser;
}
public void retrieve() {
CurrentUserUtil.setSystemUserAuthentication();
try {
final List<RssFeed> feeds = findFeedsToProcess();
for (final RssFeed feed : feeds) {
handleFeed(feed);
}
if (feeds.isEmpty()) {
logger.info("RSS Retrieval found nothing to do, will sleep");
this.delay.sleep();
} else {
this.delay.reset();
}
} finally {
CurrentUserUtil.clearAuthentication();
}
}
private List<RssFeed> findFeedsToProcess() {
try {
return this.transactionTemplate.execute(new TransactionCallback<List<RssFeed>>() {
public List<RssFeed> doInTransaction(TransactionStatus status) {
return RssRetrievalTaskImpl.this.rssFeedDao.listRssFeedsWithRetrievalDateMoreThanXMinutesAgo(30);
}
});
} catch (final Exception e) {
logger.error("Listing sources for retrieval failed: " + e, e);
return Collections.emptyList();
}
}
private void handleFeed(final RssFeed feed) {
try {
long start = System.currentTimeMillis();
this.transactionTemplate.execute(new TransactionCallbackWithoutResult() {
@Override
protected void doInTransactionWithoutResult(TransactionStatus status) {
handleFeedImpl(feed);
}
});
logger.info("RSS Feed: '" + feed.getUrl() + "' handled in " + (System.currentTimeMillis() - start)
+ " milliseconds.");
} catch (final Exception e) {
logger.warn("RSS Feed: " + feed.getUrl() + " failed retrieval " + e, e);
}
}
private void handleFeedImpl(final RssFeed rssFeed) {
final long start = System.currentTimeMillis();
final Date retrievalDate = new Date(start);
rssFeed.setRetrievalDate(retrievalDate);
final String url = rssFeed.getUrl();
logger.info("Retrieving RSS feed: " + url);
final SyndFeedInput input = new SyndFeedInput();
URL feedUrl;
HttpClientResponse response = null;
InputStream is = null;
try {
feedUrl = new URL(url);
response = NetworkingUtil.get(feedUrl, rssFeed.getEtag(), rssFeed.getLastModifiedDate());
if (response.getStatusCode() == 304) {
logger.info("No RSS feed changes -- skipping");
} else {
is = response.getInputStream();
final SyndFeed feed = input.build(new com.sun.syndication.io.XmlReader(is));
rssFeed.setEtag(response.getEtag());
rssFeed.setLastModifiedDate(response.getLastModifiedDate());
logger.info("Parsing took: " + (System.currentTimeMillis() - start) + " milliseconds");
@SuppressWarnings("unchecked")
final List<SyndEntry> entries = feed.getEntries();
// newer ones listed first--we want the older ones to be
// processed first
Collections.sort(entries, ENTRY_COMPARATOR);
// FIXME - ugly. possible to replace DocumentUser with
// SourceUser and DocumentSource? or just more complicated?
for (final SyndEntry entry : entries) {
handleEntry(rssFeed, entry);
}
logger.info("RSS feed parsing complete.");
}
} catch (Exception e) {
if (response != null) {
response.abort();
}
// FIXME - consider trying again, tallying an error count, and
// waiting 30 minutes after a certain number of errors
logger.error("Error retrieving/parsing RSS feed at: " + url + ", will wait to try again.");
logger.error("RSS Feed Error was: " + e, e);
} finally {
IOUtils.closeQuietly(response);
}
this.rssFeedDao.update(rssFeed);
// FIXME - consider optimizing this with a query
for (final RssSource ithRssSource : rssFeed.getRssSources()) {
ithRssSource.setRetrievalDate(retrievalDate);
this.sourceDao.update(ithRssSource);
}
}
private void handleEntry(final RssFeed rssFeed, final SyndEntry entry) {
logger.info("Entry: " + entry.getLink());
URL url = null;
if (entry.getLink() != null) {
try {
url = new URL(entry.getLink());
} catch (MalformedURLException e1) {
logger.error(" Bad link: " + entry.getLink() + ", skipping");
return;
}
Document document = this.documentDao.findByUrl(url.toExternalForm());
if (document == null) {
final String title = readTitle(entry);
final String description = readDescription(entry);
document = new Document(DocumentType.W, url, title, description);
document.setCreationDate(this.firstNotNull(entry.getPublishedDate(), entry.getUpdatedDate(), new Date()));
this.documentDao.add(document);
} else {
logger.info("Document already exists, will not queue again.");
}
// FIXME - optimize, perhaps with an intelligent query
for (final RssSource ithRssSource : rssFeed.getRssSources()) {
final Long userId = ithRssSource.getUserId();
final User user = this.userDao.get(userId);
final DocumentUser documentUser = new DocumentUser(document, user);
if (!document.getDocumentUsers().contains(documentUser)) {
document.getDocumentUsers().add(documentUser);
}
this.documentDao.update(document);
}
}
}
private Date firstNotNull(Date... dates) {
for (final Date date : dates) {
if (date != null) {
return date;
}
}
return null;
}
private String readDescription(final SyndEntry syndEntry) {
final SyndContent syndContent = syndEntry.getDescription();
return readString(syndContent == null ? null : syndContent.getValue());
}
private String readString(String value) {
if (value == null) {
return null;
}
try {
final byte[] bytes = value.getBytes("UTF-8");
value = new String(bytes, "UTF-8");
final String parsed = this.webContentParser.parseHTML(value);
return parsed;
} catch (IOException e) {
logger.error("Failed to parse string '" + value + "' " + e, e);
return null;
}
}
private String readTitle(final SyndEntry syndEntry) {
return readString(syndEntry == null ? null : syndEntry.getTitle());
}
}