/* * PROPRIETARY and CONFIDENTIAL * * Copyright 2012 Magellan Distribution Corporation * * All rights reserved. */ package com.ajah.syndicate.fetch; import java.io.IOException; import lombok.extern.java.Log; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import com.ajah.spring.jdbc.err.DataOperationException; import com.ajah.syndicate.FeedSource; import com.ajah.syndicate.data.FeedEntryManager; import com.ajah.syndicate.data.FeedManager; import com.ajah.syndicate.data.FeedSourceManager; /** * Simple utility for finding {@link FeedSource}s. * * @author <a href="http://efsavage.com">Eric F. Savage</a>, <a * href="mailto:code@efsavage.com">code@efsavage.com</a>. */ @Log @Component public class FeedDiscovery { @Autowired FeedSourceManager feedSourceManager; @Autowired FeedManager feedManager; @Autowired FeedEntryManager entryManager; /** * Pulls a page and attempts to discover a feed for it via * link[rel='alternate']. * * @param url * The URL of the page to try and discover the feed for. * @return The feedsource if matched or created, may be null. * @throws ClientProtocolException * If the page could not be pulled. * @throws IOException * If the page could not be pulled. * @throws DataOperationException * If a query could not be executed. */ public FeedSource discover(final String url) throws ClientProtocolException, IOException, DataOperationException { log.fine("Discovering feed for " + url); try (final CloseableHttpClient client = HttpClientBuilder.create().build()) { final HttpGet get = new HttpGet(url); try (final CloseableHttpResponse response = client.execute(get)) { final String html = EntityUtils.toString(response.getEntity()); final Document doc = Jsoup.parse(html); final Elements alternateLinks = doc.select("link"); for (final Element alternateLink : alternateLinks) { if ("alternate".equals(alternateLink.attr("rel"))) { if ("application/rss+xml".equals(alternateLink.attr("type"))) { log.fine("Found rss link " + alternateLink.attr("href")); final String rss = alternateLink.attr("href"); return this.feedSourceManager.findOrCreateByFeedUrl(rss); } log.fine("Found alternate link " + alternateLink.html()); } else { log.fine("Found link " + alternateLink.html()); } } } } return null; } }