package org.podcastpedia.admin.update;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.utils.DateUtils;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.log4j.Logger;
import org.joda.time.DateTime;
import org.podcastpedia.admin.dao.DeleteDao;
import org.podcastpedia.admin.dao.InsertDao;
import org.podcastpedia.admin.dao.ReadDao;
import org.podcastpedia.admin.dao.UpdateDao;
import org.podcastpedia.admin.dao.helper.InputMarkNewEpisodesAsNew;
import org.podcastpedia.admin.util.PodcastAndEpisodeAttributesService;
import org.podcastpedia.admin.util.SyndFeedService;
import org.podcastpedia.admin.util.read.ReadService;
import org.podcastpedia.common.domain.Episode;
import org.podcastpedia.common.domain.Podcast;
import org.podcastpedia.common.exception.BusinessException;
import org.podcastpedia.common.types.HttpStatusExtensionType;
import org.podcastpedia.common.util.config.ConfigBean;
import org.springframework.beans.factory.annotation.Autowired;
import org.xml.sax.InputSource;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
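/**
* Service implementation that updates podcasts and their episodes: it checks
* the feed for changes (via the ETag / Last-Modified headers), inserts new
* episodes, marks episodes that are no longer reachable as unavailable and
* refreshes the podcast's transient data (availability, last update, header
* fields).
*/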
public class UpdateServiceImpl implements UpdateService {
private static final int TIMEOUT_SECONDS = 10;
private static final Logger LOG = Logger.getLogger(UpdateServiceImpl.class);
@Autowired
private DeleteDao deleteDao;
@Autowired
private ReadDao readDao;
@Autowired
private InsertDao insertDao;
@Autowired
private UpdateDao updateDao;
@Autowired
private ReadService readService;
@Autowired
private PodcastAndEpisodeAttributesService podcastAndEpisodeAttributesService;
@Autowired
SyndFeedService syndFeedService;
@Autowired
private ConfigBean configBean;
@Autowired
private PoolingHttpClientConnectionManager poolingHttpClientConnectionManager;
/**
* Entry point method to update the podcast: checks whether the feed has
* changed, adds the new episodes, flags the episodes that are no longer
* reachable and updates the podcast's transient data.
*
* @param podcast the podcast to update (re-read from the database when called manually)
* @param isCalledManually true when the update is triggered manually
* @param isFeedLoadedFromLocalFile true when the feed should be parsed from a locally downloaded file
* @throws FeedException
* @throws IllegalArgumentException
* @throws IOException
*/
// @Transactional TODO - uncomment this when migrate to 5.6 innodb and
// lucene for search
public void updatePodcastById(Podcast podcast, Boolean isCalledManually,
boolean isFeedLoadedFromLocalFile) throws IllegalArgumentException,
FeedException, IOException {
Integer podcastId = podcast.getPodcastId();
try {
// when called manually we have to get the necessary attributes for
// update
if (isCalledManually) {
podcast = readDao.getPodcastForUpdateById(podcastId);
}
if (podcast == null)
throw new BusinessException("No podcast found for podcast id ["
+ podcastId + "]");
LOG.info("UPDATING podId[" + podcast.getPodcastId()
+ "] with feed - " + podcast.getUrl());
// the etag and last-modified attributes are updated ONLY IF the feed
// has changed or does not support etag and last-modified
Integer podcastStatus = getFeedUpdateStatus(podcast, podcastId);
boolean checkFeedForUpdate = (podcastStatus == HttpStatusExtensionType.URL_CONTENT_MODIFIED
.getCode()) || isCalledManually;
if (checkFeedForUpdate) {
// get only the episodes that are still marked as available
List<Episode> reachableEpisodes = readDao
.getAvailableEpisodesFromDB(podcastId);
podcast.setEpisodes(reachableEpisodes);
// get the max episode id from the database - we'll add to that
// the new episodes, if any
int maxIndex = readDao.getMaxEpisodeIdForPodcast(podcastId);
// get the new episodes from feed
List<Episode> newEpisodes = getNewEpisodes(podcast, maxIndex,
isFeedLoadedFromLocalFile);
if (newEpisodes.size() > 0) {
podcast.setLastUpdate(new Date());
addNewEpisodes(newEpisodes, podcast.getPodcastId());
}
List<Episode> notReachableEpisodes = getNotReachableEpisodes(podcast);
if (notReachableEpisodes.size() > 0) {
for (Episode e : notReachableEpisodes) {
e.setIsAvailable(0);// TODO make enum out of this
updateDao.updateEpisodeAvailability(e);
}
podcast.setLastUpdate(new Date());
}
// now also update the podcast itself - the podcast is reachable - and
// its transient data (etag, publication_date, last_update, last
// episode url etc.)
podcast.setAvailability(HttpStatus.SC_OK);
updateDao.updateTransientDataForPodcastById(podcast);
} else if (podcastStatus != HttpStatus.SC_OK
&& podcastStatus != HttpStatus.SC_NOT_MODIFIED
&& podcastStatus != HttpStatusExtensionType.SOCKET_TIMEOUT_EXCEPTION
.getCode()) {
// some sort of error must have happened, so we need to update
// the availability of the podcast
podcast.setAvailability(podcastStatus);
updateDao.updatePodcastAvailability(podcast);
}
} catch (Exception e) {
if (e instanceof MalformedURLException) {
LOG.error("MalformedURLException podcastId [ "
+ podcast.getPodcastId() + " ] url ["
+ podcast.getUrl() + "] ", e);
} else if (e instanceof IllegalArgumentException) {
LOG.error("IllegalArgumentException podcastId [ "
+ podcast.getPodcastId() + " ] url ["
+ podcast.getUrl() + "] ", e);
} else if (e instanceof FeedException) {
LOG.error("FeedException podcastId [ " + podcast.getPodcastId()
+ " ] url [" + podcast.getUrl() + "] ", e);
} else if (e instanceof IOException) {
LOG.error("IOException podcastId [ " + podcast.getPodcastId()
+ " ] url [" + podcast.getUrl() + "] ", e);
} else if (e instanceof BusinessException) {
LOG.error("Business exception podcastId [ " + podcastId
+ " ] url [" + (podcast != null ? podcast.getUrl() : "n/a")
+ "] ", e);
return;
} else {
LOG.error("Episodes still reachable but UNKNOWN ERROR when updating the podcastId [ "
+ podcast.getPodcastId() + " ] url ["
+ podcast.getUrl() + "] ", e);
}
podcast.setAvailability(HttpStatusExtensionType.PODCAST_IN_ERROR
.getCode());
updateDao.updateTransientDataForPodcastById(podcast);
}
}
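/**
* Inserts the new episodes in the database and flags them as "new": all
* existing episodes of the podcast are un-flagged first, then the new ones
* are inserted and marked.
*/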
private void addNewEpisodes(List<Episode> newEpisodes, Integer podcastId) {
try {
updateDao.markAllEpisodesAsNotNew(podcastId);
insertNewEpisodesInDB(newEpisodes);
markNewEpisodesWithNewFlag(newEpisodes, podcastId);
} catch (Exception e) {
LOG.error(" Error when marking new episodes as new PodId["
+ podcastId, e);
}
}
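/**
* Inserts the given episodes in the database; episodes without a link to a
* media file ("noMediaUrl") are skipped, and insert failures are logged and
* ignored so that the remaining episodes can still be processed.
*/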
private void insertNewEpisodesInDB(List<Episode> episodes) {
for (Episode episode : episodes) {
try {
// TODO when I move to InnoDB for table and lucene search,
// rollback the transaction with log message when updating the
// podcast
boolean episodeHasLinkToMediaFile = !episode.getMediaUrl()
.equals("noMediaUrl");
if (episodeHasLinkToMediaFile) {
insertDao.insertEpisode(episode);
LOG.info("PodId[" + episode.getPodcastId()
+ "] - INSERT EPISODE epId["
+ episode.getEpisodeId() + "] - epURL "
+ episode.getMediaUrl());
}
} catch (Exception e) {
LOG.error("PodId[" + episode.getPodcastId()
+ "] ERROR inserting episode " + episode.getMediaUrl(), e);
continue; // do not mark it as a new episode
}
}
}
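/**
* Marks the freshly inserted episodes of the podcast with the "new" flag.
*/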
private void markNewEpisodesWithNewFlag(List<Episode> newEpisodes,
Integer podcastId) {
// mark the new episodes as new
InputMarkNewEpisodesAsNew input = new InputMarkNewEpisodesAsNew();
input.setEpisodes(newEpisodes);
input.setPodcastId(podcastId);
updateDao.markNewEpisodesAsNew(input);
}
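/**
* Issues a HEAD request for each episode's media url and collects the
* episodes that are no longer reachable. Episodes without a media url are
* considered unreachable, while socket/connect timeouts and unknown host
* errors are skipped optimistically (they should be caught by the next
* update).
*/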
private List<Episode> getNotReachableEpisodes(Podcast podcast) {
List<Episode> notReachableEpisodes = new ArrayList<Episode>();
// build the http client once and reuse it for all HEAD requests
RequestConfig requestConfig = RequestConfig.custom()
.setSocketTimeout(TIMEOUT_SECONDS * 1500)
.setConnectTimeout(TIMEOUT_SECONDS * 1500)
.build();
CloseableHttpClient httpClient = HttpClientBuilder
.create()
.setDefaultRequestConfig(requestConfig)
.setConnectionManager(poolingHttpClientConnectionManager)
.build();
// variable hoisted out of the loop
String episodeMediaUrl = null;
// loop through all the current episodes of the podcast
for (Episode ep : podcast.getEpisodes()) {
episodeMediaUrl = ep.getMediaUrl();
if (episodeMediaUrl != null
&& !episodeMediaUrl.equals("noMediaUrl")) {
HttpHead headMethod = null;
try {
headMethod = new HttpHead(episodeMediaUrl);
HttpResponse httpResponse = httpClient.execute(headMethod);
int statusCode = httpResponse.getStatusLine()
.getStatusCode();
ep.setAvailability(statusCode);
// some sites don't allow access with the http client, but
// it works in the player
boolean notReachableCondition = !((statusCode == HttpStatus.SC_OK)
|| (statusCode == HttpStatus.SC_UNAUTHORIZED)
|| (statusCode == HttpStatus.SC_FORBIDDEN));
if (notReachableCondition) {
notReachableEpisodes.add(ep);
LOG.info("EPISODE UNAVAILABLE - PodId["
+ podcast.getPodcastId() + "] - httpStatus "
+ statusCode + " ep_URL[" + ep.getMediaUrl()
+ "] ");
}
} catch (IOException e) {
if (e instanceof SocketTimeoutException) {
LOG.warn("PodId[" + podcast.getPodcastId()
+ "] - socket timeout exception - epId["
+ ep.getEpisodeId() + "]" + " ep_URL["
+ ep.getMediaUrl() + "] ");
continue; // optimistic approach - a 404 should be
// caught by next update
} else if (e instanceof NoHttpResponseException) {
LOG.error(
"PodId["
+ podcast.getPodcastId()
+ "] - no http response exception - epId["
+ ep.getEpisodeId() + "]", e);
} else if (e instanceof UnknownHostException) {
// TODO this is to avoid the npr podcasts problem, although
// the links are still available - verify in log after update
LOG.warn("PodId[" + podcast.getPodcastId()
+ "] - unknown host exception - epId["
+ ep.getEpisodeId() + "] ep_URL["
+ ep.getMediaUrl() + "] ");
continue; // optimistic approach - a 404 should be
// caught by next update
} else if (e instanceof ConnectTimeoutException) {
LOG.warn("PodId[" + podcast.getPodcastId()
+ "] - connect timeout exception - epId["
+ ep.getEpisodeId() + "] ep_URL["
+ ep.getMediaUrl() + "] ");
continue; // optimistic approach - a 404 should be
// caught by next update
}
ep.setAvailability(HttpStatusExtensionType.IO_EXCEPTION
.getCode());
notReachableEpisodes.add(ep);
LOG.error("PodId[" + podcast.getPodcastId()
+ "] - IOException - epId[" + ep.getEpisodeId()
+ "]" + "ep_URL[" + ep.getMediaUrl() + "]", e);
continue;
} catch (IllegalArgumentException e) {
ep.setAvailability(HttpStatusExtensionType.ILLEGAL_ARGUMENT_EXCEPTION
.getCode());
notReachableEpisodes.add(ep);
LOG.error(
"PodId[" + podcast.getPodcastId()
+ "] possible false URL - - epId["
+ ep.getEpisodeId() + "]" + "ep_URL["
+ ep.getMediaUrl() + "] ", e);
continue;
} catch (Exception e) {
ep.setAvailability(HttpStatusExtensionType.EXCEPTION
.getCode());
notReachableEpisodes.add(ep);
LOG.error(
"PodId[" + podcast.getPodcastId()
+ "] - UNKNOWN EXCEPTION - epId["
+ ep.getEpisodeId() + "]" + "ep_URL["
+ ep.getMediaUrl() + "]", e);
continue;
} finally {
if (headMethod != null) {
// Release the connection.
headMethod.releaseConnection();
}
}
} else {
LOG.debug("PodId[" + podcast.getPodcastId() + "] - "
+ "NO MEDIA Url epId[" + ep.getEpisodeId() + "]");
notReachableEpisodes.add(ep);
}
}
return notReachableEpisodes;
}
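/**
* Parses the feed and returns the episodes that are not yet stored in the
* database. A feed entry is considered already stored when its publication
* date, title and media url match an episode from the database (or title
* and media url only, when the entry has no publication date).
*/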
private List<Episode> getNewEpisodes(Podcast podcast, Integer maxIndex,
boolean isFeedLoadedFromLocalFile) throws IOException,
IllegalArgumentException, FeedException, BusinessException {
List<Episode> newEpisodes = new ArrayList<Episode>();
SyndFeed syndFeedForUrl = getSyndFeedForUpdate(podcast,
isFeedLoadedFromLocalFile);
podcast.setPodcastFeed(syndFeedForUrl);
Episode newEpisode = null;
DateTime publicationDateOfNewEpisode = null;
boolean isAlreadyInDB = false;
DateTime publicationDateOfStillReachableEpisode = null;
// iterate through the episodes from the feed to find out which ones are
// new and add them to the database
for (SyndEntry entry : (List<SyndEntry>) podcast
.getPodcastFeed().getEntries()) {
newEpisode = new Episode();
isAlreadyInDB = false; // reset the flag for each feed entry
newEpisode.setPodcastId(podcast.getPodcastId());
// set new episode's attributes so that we can compare it with the
// ones that are still reachable
podcastAndEpisodeAttributesService.setEpisodeAttributes(newEpisode, podcast, entry);
publicationDateOfNewEpisode = newEpisode.getPublicationDate() != null ? new DateTime(
newEpisode.getPublicationDate()) : null;
if (publicationDateOfNewEpisode == null) {
LOG.warn("PodId[" + podcast.getPodcastId() + "] - "
+ "COULD NOT GET PUBLICATION_DATE -" + "epTitle["
+ newEpisode.getTitle() + "]");
}
// iterate through the stored episodes to see if we need to add it
// or not
Iterator<Episode> episodeIterator = podcast.getEpisodes()
.iterator();
while (episodeIterator.hasNext()) {
Episode stillReachableEpisode = episodeIterator.next();
// verify the existence of the episode in the database
publicationDateOfStillReachableEpisode = stillReachableEpisode
.getPublicationDate() != null ? new DateTime(
stillReachableEpisode.getPublicationDate()) : null;
if (publicationDateOfNewEpisode != null) {
// this condition also supports re-broadcasting -
// some producers do that
if (publicationDateOfStillReachableEpisode != null
&& publicationDateOfNewEpisode.getMillis() == publicationDateOfStillReachableEpisode
.getMillis()
&& stillReachableEpisode.getTitle().trim()
.equals(newEpisode.getTitle().trim())
&& stillReachableEpisode.getMediaUrl().trim()
.equals(newEpisode.getMediaUrl().trim())) {
// this entry is already in the database, break
isAlreadyInDB = true;
// the episode is removed so that it won't be considered when
// episode availability is checked in the next function call
// (spares an HTTP call)
episodeIterator.remove();
break;
}
} else {
// try matching on title and media url - not that strong a match
if (stillReachableEpisode.getTitle().trim()
.equals(newEpisode.getTitle().trim())
&& stillReachableEpisode.getMediaUrl().trim()
.equals(newEpisode.getMediaUrl().trim())) {
// this entry is already in the database, break
isAlreadyInDB = true;
// the episode is removed so that it won't be considered when
// episode availability is checked in the next function call
// (spares an HTTP call)
episodeIterator.remove();
LOG.warn("PodId[" + podcast.getPodcastId()
+ "] - MATCHED but not on publication DATE - "
+ "epUrl[" + newEpisode.getMediaUrl() + "]");
// TODO maybe persist this message in the database with
// a code ....
break;
}
}
}
// if it is already in the database then continue
if (isAlreadyInDB) {
continue;
} else {
// new episode is only invalidated when it matches date, title
// and media url from the db
// set index to the next episode
newEpisode.setEpisodeId(++maxIndex);
// TODO maybe check if the URL is reachable with http client, but
// we go on trust so far
newEpisode.setAvailability(HttpStatus.SC_OK);
// add it to the response
newEpisodes.add(newEpisode);
}
}
// by this point the podcast episodes should contain only the old reachable
// episodes that are no longer present in the feed itself - this spares HTTP
// calls (see the "remove" calls above)
// TODO - it's a trade-off between more HTTP calls and memory & processing
// consumption, if the second variant were to be chosen
return newEpisodes;
}
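/**
* Builds the SyndFeed either from a locally downloaded file (when
* isFeedLoadedFromLocalFile is set - the feed title is verified against the
* podcast title) or directly from the podcast's feed url.
*/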
private SyndFeed getSyndFeedForUpdate(Podcast podcast,
boolean isFeedLoadedFromLocalFile) throws MalformedURLException,
IOException, FeedException, BusinessException {
SyndFeed syndFeed;
if (isFeedLoadedFromLocalFile) {
syndFeed = getSyndFeedFromLocalFile(configBean
.get("LOCAL_PATH_FOR_FEED"));
if (!syndFeed.getTitle().equalsIgnoreCase(podcast.getTitle())) {
throw new BusinessException(
"The proper file might not have been downloaded locally, please verify again");
}
} else {
syndFeed = syndFeedService.getSyndFeedForUrl(podcast.getUrl());
}
return syndFeed;
}
/**
* Calls the feed url (a HEAD request with the If-None-Match and
* If-Modified-Since headers) to verify whether it has been modified since
* the last call and returns the http status if available, or an error code
* if not.
*
* @param podcast the podcast whose feed is checked
* @return the http status code, or an extension code from HttpStatusExtensionType
*/
private Integer getFeedUpdateStatus(Podcast podcast, int podcastId) {
if (podcast == null) {
LOG.error("No podcast anymore for podcastId - " + podcastId);
// move on but needs to be investigated from the log file
return HttpStatusExtensionType.PODCAST_IN_ERROR.getCode();
}
String podcastUrl = podcast.getUrl();
if (podcastUrl == null) {
LOG.error(" URL IS NULLL podcast[" + podcast.getPodcastId() + "]");
// move on but needs to be investigated from the log file
return HttpStatusExtensionType.PODCAST_IN_ERROR.getCode();
}
HttpHead headMethod = null;
try {
headMethod = new HttpHead(podcastUrl);
if (podcast.getEtagHeaderField() != null) {
headMethod.addHeader("If-None-Match",
podcast.getEtagHeaderField());
}
if (podcast.getLastModifiedHeaderField() != null) {
headMethod.addHeader("If-Modified-Since",
podcast.getLastModifiedHeaderFieldStr());
}
RequestConfig requestConfig = RequestConfig.custom()
.setSocketTimeout(TIMEOUT_SECONDS * 1000)
.setConnectTimeout(TIMEOUT_SECONDS * 1000)
.build();
CloseableHttpClient httpClient = HttpClientBuilder
.create()
.setDefaultRequestConfig(requestConfig)
.setConnectionManager(poolingHttpClientConnectionManager)
.build();
HttpResponse httpResponse = httpClient.execute(headMethod);
int statusCode = httpResponse.getStatusLine().getStatusCode();
// if the podcast file has not been modified there is no need to
// update
if (statusCode == HttpStatus.SC_NOT_MODIFIED) {
LOG.info("PodId[ " + podcast.getPodcastId() + " ]"
+ " pod url[ " + podcast.getUrl() + " ]"
+ " FEED NOT MODIFIED NO UPDATE ");
return statusCode;
} else {
if (statusCode != HttpStatus.SC_OK) {
LOG.error("PodId[ " + podcast.getPodcastId() + " ]"
+ " pod url[ " + podcast.getUrl()
+ " ] : http status code " + statusCode);
return statusCode;
}
// set the new etag if existent
org.apache.http.Header eTagHeader = httpResponse
.getLastHeader("etag");
if (eTagHeader != null) {
podcast.setEtagHeaderField(eTagHeader.getValue());
}
// set the new "last modified" header field if existent
org.apache.http.Header lastModifiedHeader = httpResponse
.getLastHeader("last-modified");
if (lastModifiedHeader != null) {
podcast.setLastModifiedHeaderField(DateUtils
.parseDate(lastModifiedHeader.getValue()));
podcast.setLastModifiedHeaderFieldStr(lastModifiedHeader
.getValue());
}
return HttpStatusExtensionType.URL_CONTENT_MODIFIED.getCode();
}
} catch (IOException e) {
if (e instanceof SocketTimeoutException) {
LOG.error("PodId[ " + podcast.getPodcastId() + " ]"
+ " pod url[ " + podcast.getUrl()
+ " ] : Socket timeout exception " + e.getMessage());
return HttpStatusExtensionType.SOCKET_TIMEOUT_EXCEPTION
.getCode();
}
LOG.error("PodId[ " + podcast.getPodcastId() + " ]" + " pod url[ "
+ podcast.getUrl() + " ] : fatal transport error: "
+ e.getMessage());
return HttpStatusExtensionType.IO_EXCEPTION.getCode();
} catch (Exception e) {
LOG.error("PodId[ " + podcast.getPodcastId() + " ]" + " pod url[ "
+ podcast.getUrl() + " ] : UNKNOWN EXCEPTION ");
return HttpStatusExtensionType.EXCEPTION.getCode();
} finally {
if (headMethod != null) {
// Release the connection.
headMethod.releaseConnection();
}
}
}
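/**
* Reads and parses a feed from a file on the local file system.
*/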
public SyndFeed getSyndFeedFromLocalFile(String filePath)
throws MalformedURLException, IOException,
IllegalArgumentException, FeedException {
SyndFeed feed = null;
FileInputStream fis = null;
try {
fis = new FileInputStream(filePath);
InputSource source = new InputSource(fis);
SyndFeedInput input = new SyndFeedInput();
feed = input.build(source);
} finally {
if (fis != null) {
fis.close();
}
}
return feed;
}
}