/*
* Carrot2 project.
*
* Copyright (C) 2002-2016, Dawid Weiss, Stanisław Osiński.
* All rights reserved.
*
* Refer to the full license file "carrot2.LICENSE"
* in the root folder of the repository checkout or at:
* http://www.carrot2.org/carrot2.LICENSE
*/
package org.carrot2.source.opensearch;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.StringEscapeUtils;
import org.carrot2.core.Document;
import org.carrot2.source.SearchEngineResponse;
import org.carrot2.util.StringUtils;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.fetcher.FeedFetcher;
import com.sun.syndication.fetcher.FetcherException;
import com.sun.syndication.io.FeedException;
/**
 * Utility methods for working with Rome fetcher.
 */
public class RomeFetcherUtils
{
    /**
     * Fetches an OpenSearch feed from the provided URL and returns the entries as
     * Carrot2 {@link SearchEngineResponse}. Each feed entry becomes one
     * {@link Document} with its {@link Document#TITLE}, {@link Document#SUMMARY} and
     * {@link Document#CONTENT_URL} fields set. Title and summary have HTML entities
     * unescaped and HTML tags removed; the link is passed through unchanged.
     * <p>
     * Entries without a title or without a description do not cause a failure: the
     * corresponding field is set to <code>null</code>.
     *
     * @param url the OpenSearch feed to fetch
     * @param feedFetcher Rome fetcher to use
     * @return {@link SearchEngineResponse} containing entries from the feed; empty if
     *         the fetcher returned no feed
     * @throws MalformedURLException if <code>url</code> cannot be parsed as a URL
     * @throws IOException propagated from the fetcher
     * @throws FeedException propagated from the fetcher
     * @throws FetcherException propagated from the fetcher
     */
    public static SearchEngineResponse fetchUrl(final String url, FeedFetcher feedFetcher)
        throws IOException, FeedException, FetcherException, MalformedURLException
    {
        /*
         * TODO: Rome fetcher uses SUN's HttpClient and opens a persistent HTTP connection
         * (background thread that keeps reference to the class loader). This causes minor
         * memory leaks when reloading Web applications. Consider: 1) patching rome
         * fetcher sources and adding Connection: close to request headers, 2) using
         * Apache HttpClient, 3) using manual fetch of the syndication feed.
         */
        final SyndFeed feed = feedFetcher.retrieveFeed(new URL(url));
        final SearchEngineResponse response = new SearchEngineResponse();

        // The documentation does not mention that null value can be returned
        // but we've seen a NPE here:
        // http://builds.carrot2.org/browse/C2HEAD-SOURCES-4.
        if (feed == null)
        {
            return response;
        }

        // Rome's API is not generified, hence the wildcard list and the cast below.
        final List<?> entries = feed.getEntries();
        if (entries == null)
        {
            return response;
        }

        for (final Object item : entries)
        {
            final SyndEntry entry = (SyndEntry) item;

            // An entry without a description/summary element yields a null
            // description object; dereferencing it unconditionally would abort the
            // whole fetch with a NullPointerException.
            final String summary = entry.getDescription() == null
                ? null
                : entry.getDescription().getValue();

            final Document document = new Document();
            document.setField(Document.TITLE, clean(entry.getTitle()));
            document.setField(Document.SUMMARY, clean(summary));
            document.setField(Document.CONTENT_URL, entry.getLink());
            response.results.add(document);
        }

        return response;
    }

    /**
     * Unescapes HTML entities in the provided string and then strips HTML tags from
     * the result.
     *
     * @param string text to clean, may be <code>null</code>
     * @return cleaned text or <code>null</code> if the input was <code>null</code>
     */
    private static String clean(String string)
    {
        // Feed entries may lack a title or a description value; short-circuit so
        // that the tag-stripping helper is never handed a null.
        if (string == null)
        {
            return null;
        }
        return StringUtils.removeHtmlTags(StringEscapeUtils.unescapeHtml(string));
    }
}