FeedUtils.java example

Explorer

commafeed-master
- src
  - main
    - java
      - com
        commafeed
        CommaFeedApplication.java
        CommaFeedConfiguration.java
        CommaFeedModule.java
        backend
        ContentEncodingInterceptor.java
        FixedSizeSortedSet.java
        HttpGetter.java
        cache
        CacheService.java
        NoopCacheService.java
        RedisCacheService.java
        RedisPoolFactory.java
        dao
        FeedCategoryDAO.java
        FeedDAO.java
        FeedEntryContentDAO.java
        FeedEntryDAO.java
        FeedEntryStatusDAO.java
        FeedEntryTagDAO.java
        FeedSubscriptionDAO.java
        GenericDAO.java
        UnitOfWork.java
        UserDAO.java
        UserRoleDAO.java
        UserSettingsDAO.java
        favicon
        AbstractFaviconFetcher.java
        DefaultFaviconFetcher.java
        FacebookFaviconFetcher.java
        YoutubeFaviconFetcher.java
        feed
        EstimateDirection.java
        FeedEntryKeyword.java
        FeedFetcher.java
        FeedParser.java
        FeedQueues.java
        FeedRefreshContext.java
        FeedRefreshExecutor.java
        FeedRefreshTaskGiver.java
        FeedRefreshUpdater.java
        FeedRefreshWorker.java
        FeedUtils.java
        FetchedFeed.java
        HtmlEntities.java
        model
        AbstractModel.java
        Feed.java
        FeedCategory.java
        FeedEntry.java
        FeedEntryContent.java
        FeedEntryStatus.java
        FeedEntryTag.java
        FeedSubscription.java
        Models.java
        User.java
        UserRole.java
        UserSettings.java
        opml
        OPMLExporter.java
        OPMLImporter.java
        rome
        OPML11Generator.java
        OPML11Parser.java
        RSS090DescriptionConverter.java
        RSS090DescriptionParser.java
        RSSRDF10Parser.java
        service
        DatabaseCleaningService.java
        FeedEntryContentService.java
        FeedEntryFilteringService.java
        FeedEntryService.java
        FeedEntryTagService.java
        FeedService.java
        FeedSubscriptionService.java
        FeedUpdateService.java
        MailService.java
        PasswordEncryptionService.java
        PubSubService.java
        StartupService.java
        UserService.java
        internal
        PostLoginActivities.java
        task
        OldEntriesCleanupTask.java
        OldStatusesCleanupTask.java
        OrphanedContentsCleanupTask.java
        OrphanedFeedsCleanupTask.java
        ScheduledTask.java
        frontend
        auth
        SecurityCheck.java
        SecurityCheckFactory.java
        SecurityCheckFactoryProvider.java
        model
        Category.java
        Entries.java
        Entry.java
        FeedInfo.java
        ServerInfo.java
        Settings.java
        Subscription.java
        UnreadCount.java
        UserModel.java
        request
        AddCategoryRequest.java
        CategoryModificationRequest.java
        CollapseRequest.java
        FeedInfoRequest.java
        FeedMergeRequest.java
        FeedModificationRequest.java
        IDRequest.java
        LoginRequest.java
        MarkRequest.java
        MultipleMarkRequest.java
        PasswordResetRequest.java
        ProfileModificationRequest.java
        RegistrationRequest.java
        StarRequest.java
        SubscribeRequest.java
        TagRequest.java
        resource
        AdminREST.java
        CategoryREST.java
        EntryREST.java
        FeedREST.java
        PubSubHubbubCallbackREST.java
        ServerREST.java
        UserREST.java
        servlet
        AnalyticsServlet.java
        CustomCssServlet.java
        LogoutServlet.java
        NextUnreadServlet.java
        session
        SessionHelper.java
        SessionHelperFactory.java
        SessionHelperFactoryProvider.java
        SessionManagerFactory.java
      - edu
        uci
        ics
        crawler4j
        url
        URLCanonicalizer.java
        UrlResolver.java
  - test
    - java
      - com
        commafeed
        backend
        FixedSizeSortedSetTest.java
        feed
        EstimateDirectionTest.java
        FeedUtilsTest.java
        opml
        OPMLExporterTest.java
        OPMLImporterTest.java
        service
        FeedEntryFilteringServiceTest.java
        UserServiceTest.java
        frontend
        auth
        SecurityCheckFactoryTest.java
        resource
        UserRestTest.java
        session
        SessionHelperTest.java

package com.commafeed.backend.feed;

import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.w3c.css.sac.InputSource;
import org.w3c.dom.css.CSSStyleDeclaration;

import com.commafeed.backend.feed.FeedEntryKeyword.Mode;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.steadystate.css.parser.CSSOMParser;

import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import lombok.extern.slf4j.Slf4j;

/**
 * Utility methods related to feed handling: encoding detection, url normalization, html sanitization, refresh scheduling, url resolution
 * and image proxying. All methods are static and the class holds no mutable state.
 * 
 */
@Slf4j
public class FeedUtils {

	/** css properties that are kept in the style attribute of iframe elements */
	private static final List<String> ALLOWED_IFRAME_CSS_RULES = Arrays.asList("height", "width", "border");

	/** css properties that are kept in the style attribute of img elements */
	private static final List<String> ALLOWED_IMG_CSS_RULES = Arrays.asList("display", "width", "height");

	/** a css value containing one of these characters is dropped, presumably to reject css functions such as url(...) */
	private static final char[] FORBIDDEN_CSS_RULE_CHARACTERS = new char[] { '(', ')' };

	/** matches a string that starts with 'scheme://', compiled once instead of on every call */
	private static final Pattern ABSOLUTE_URL_PATTERN = Pattern.compile("^\\w+\\:\\/\\/.*");

	private static final Whitelist WHITELIST = buildWhiteList();

	/**
	 * Cut a string down to a maximum number of chars.
	 * 
	 * @param string
	 *            the string to truncate, may be null
	 * @param length
	 *            the maximum number of chars to keep, negative values are treated as 0
	 * @return the first {@code length} chars of the string, the string itself if it is shorter, or null if the string was null
	 */
	public static String truncate(String string, int length) {
		if (string == null) {
			return null;
		}
		// clamp on both sides so that a negative length yields an empty string instead of an exception
		int end = Math.max(0, Math.min(length, string.length()));
		return string.substring(0, end);
	}

	/**
	 * Build the jsoup whitelist (allowed tags, attributes and protocols) used to sanitize entry content. Only called once, from the static
	 * initializer, so no additional synchronization is needed.
	 */
	private static Whitelist buildWhiteList() {
		Whitelist whitelist = new Whitelist();
		whitelist.addTags("a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1",
				"h2", "h3", "h4", "h5", "h6", "i", "iframe", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", "sup",
				"table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul");

		whitelist.addAttributes("div", "dir");
		whitelist.addAttributes("pre", "dir");
		whitelist.addAttributes("code", "dir");
		whitelist.addAttributes("table", "dir");
		whitelist.addAttributes("p", "dir");
		whitelist.addAttributes("a", "href", "title");
		whitelist.addAttributes("blockquote", "cite");
		whitelist.addAttributes("col", "span", "width");
		whitelist.addAttributes("colgroup", "span", "width");
		whitelist.addAttributes("iframe", "src", "height", "width", "allowfullscreen", "frameborder", "style");
		whitelist.addAttributes("img", "align", "alt", "height", "src", "title", "width", "style");
		whitelist.addAttributes("ol", "start", "type");
		whitelist.addAttributes("q", "cite");
		whitelist.addAttributes("table", "border", "bordercolor", "summary", "width");
		whitelist.addAttributes("td", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "width");
		whitelist.addAttributes("th", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "scope", "width");
		whitelist.addAttributes("ul", "type");

		whitelist.addProtocols("a", "href", "ftp", "http", "https", "magnet", "mailto");
		whitelist.addProtocols("blockquote", "cite", "http", "https");
		whitelist.addProtocols("img", "src", "http", "https");
		whitelist.addProtocols("q", "cite", "http", "https");

		// every link opens in a new tab and does not leak the referrer
		whitelist.addEnforcedAttribute("a", "target", "_blank");
		whitelist.addEnforcedAttribute("a", "rel", "noreferrer");
		return whitelist;
	}

	/**
	 * Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
	 * feed.
	 * <p>
	 * The declared encoding is only trusted when it starts with "windows-" or with "iso-8859-" without ending with "1". In every other
	 * case, or when the declared name is not a charset known to the JVM, the encoding is detected from the bytes.
	 * 
	 * @param bytes
	 *            the raw feed
	 * @return the charset to use to decode the feed
	 */
	public static Charset guessEncoding(byte[] bytes) {
		String extracted = extractDeclaredEncoding(bytes);

		boolean trustedIso = StringUtils.startsWithIgnoreCase(extracted, "iso-8859-") && !StringUtils.endsWith(extracted, "1");
		boolean trustedWindows = StringUtils.startsWithIgnoreCase(extracted, "windows-");

		if (trustedIso || trustedWindows) {
			try {
				return Charset.forName(extracted);
			} catch (IllegalArgumentException e) {
				// illegal or unsupported charset name declared by the feed, do not fail, detect instead
				log.debug("unusable declared encoding : " + extracted, e);
			}
		}
		return detectEncoding(bytes);
	}

	/**
	 * Detect encoding by analyzing characters in the array. ISO-8859-1 is reported as windows-1252. UTF-8 is returned when nothing is
	 * detected or when the detected name is not a charset known to the JVM.
	 */
	public static Charset detectEncoding(byte[] bytes) {
		String encoding = "UTF-8";

		CharsetDetector detector = new CharsetDetector();
		detector.setText(bytes);
		CharsetMatch match = detector.detect();
		if (match != null) {
			encoding = match.getName();
		}
		if ("ISO-8859-1".equalsIgnoreCase(encoding)) {
			encoding = "windows-1252";
		}

		try {
			return Charset.forName(encoding);
		} catch (IllegalArgumentException e) {
			// the detector can report names the JVM has no charset for
			log.debug("unusable detected encoding : " + encoding, e);
			return StandardCharsets.UTF_8;
		}
	}

	/**
	 * Replace named html entities with their numeric counterparts, using the parallel arrays declared in {@link HtmlEntities}.
	 */
	public static String replaceHtmlEntitiesWithNumericEntities(String source) {
		return StringUtils.replaceEach(source, HtmlEntities.HTML_ENTITIES, HtmlEntities.NUMERIC_ENTITIES);
	}

	/**
	 * Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
	 * 
	 * @param url
	 *            the url to normalize, may be null
	 * @return the normalized url, or null if the url was null
	 */
	public static String normalizeURL(String url) {
		if (url == null) {
			return null;
		}
		String normalized = URLCanonicalizer.getCanonicalURL(url);
		if (normalized == null) {
			normalized = url;
		}

		// convert to lower case, the url probably won't work in some cases
		// after that but we don't care we just want to compare urls to avoid
		// duplicates. Locale.ROOT keeps the result independent from the server locale
		normalized = normalized.toLowerCase(Locale.ROOT);

		// store all urls as http
		if (normalized.startsWith("https:")) {
			normalized = "http" + normalized.substring(5);
		}

		// remove the www. part
		normalized = normalized.replace("//www.", "//");

		// feedproxy redirects to feedburner
		normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");

		// feedburner feeds have a special treatment: query string and trailing slash are dropped.
		// substringBefore never fails, unlike split(...)[0] which throws on a url made only of '?'
		String withoutQuery = StringUtils.substringBefore(normalized, "?");
		if (withoutQuery.contains("feedburner.com")) {
			withoutQuery = withoutQuery.replace("feeds2.feedburner.com", "feeds.feedburner.com");
			normalized = StringUtils.removeEnd(withoutQuery, "/");
		}

		return normalized;
	}

	/**
	 * Extract the declared encoding from the xml. Only the bytes up to the first '&gt;' are inspected, single quotes are accepted as well
	 * as double quotes.
	 * 
	 * @param bytes
	 *            the raw feed
	 * @return the value of the encoding pseudo-attribute, or null if there is no '&gt;', no encoding declaration or no closing quote
	 */
	public static String extractDeclaredEncoding(byte[] bytes) {
		int index = ArrayUtils.indexOf(bytes, (byte) '>');
		if (index == -1) {
			return null;
		}

		// ISO-8859-1 maps each byte to exactly one char, so the result does not depend on the platform charset
		String pi = new String(bytes, 0, index + 1, StandardCharsets.ISO_8859_1).replace('\'', '"');

		String marker = "encoding=\"";
		int start = pi.indexOf(marker);
		if (start == -1) {
			return null;
		}
		start += marker.length();

		int end = pi.indexOf('"', start);
		if (end == -1) {
			// unterminated declaration
			return null;
		}
		return pi.substring(start, end);
	}

	/**
	 * Sanitize html content: elements and attributes not in the whitelist are removed and the style attribute of iframe and img elements
	 * is reduced to the allowed css properties.
	 * 
	 * @param content
	 *            the html to clean, returned untouched when blank
	 * @param baseUri
	 *            the base uri given to the html parser, null is treated as empty
	 * @param keepTextOnly
	 *            if true the text of the cleaned document is returned instead of its html
	 * @return the sanitized content
	 */
	public static String handleContent(String content, String baseUri, boolean keepTextOnly) {
		if (StringUtils.isBlank(content)) {
			return content;
		}
		baseUri = StringUtils.trimToEmpty(baseUri);

		Document dirty = Jsoup.parseBodyFragment(content, baseUri);
		Cleaner cleaner = new Cleaner(WHITELIST);
		Document clean = cleaner.clean(dirty);

		for (Element e : clean.select("iframe[style]")) {
			e.attr("style", escapeIFrameCss(e.attr("style")));
		}

		for (Element e : clean.select("img[style]")) {
			e.attr("style", escapeImgCss(e.attr("style")));
		}

		clean.outputSettings(new OutputSettings().escapeMode(EscapeMode.base).prettyPrint(false));
		Element body = clean.body();
		return keepTextOnly ? body.text() : body.html();
	}

	/**
	 * Keep only the css declarations allowed on an iframe element.
	 * 
	 * @param orig
	 *            the content of the style attribute
	 * @return the allowed declarations joined as "property:value;", or an empty string if nothing is allowed or parsing failed
	 */
	public static String escapeIFrameCss(String orig) {
		return escapeCss(orig, ALLOWED_IFRAME_CSS_RULES);
	}

	/**
	 * Keep only the css declarations allowed on an img element.
	 * 
	 * @param orig
	 *            the content of the style attribute
	 * @return the allowed declarations joined as "property:value;", or an empty string if nothing is allowed or parsing failed
	 */
	public static String escapeImgCss(String orig) {
		return escapeCss(orig, ALLOWED_IMG_CSS_RULES);
	}

	/**
	 * Parse a style declaration and keep the properties that are in the allowed list and whose value has no forbidden character.
	 */
	private static String escapeCss(String orig, List<String> allowedProperties) {
		String rule = "";
		CSSOMParser parser = new CSSOMParser();
		try {
			List<String> rules = new ArrayList<>();
			CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));

			for (int i = 0; i < decl.getLength(); i++) {
				String property = decl.item(i);
				String value = decl.getPropertyValue(property);
				if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
					continue;
				}

				if (allowedProperties.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
					rules.add(property + ":" + value + ";");
				}
			}
			rule = StringUtils.join(rules, "");
		} catch (Exception e) {
			// best effort: on any failure the whole style is dropped
			log.error(e.getMessage(), e);
		}
		return rule;
	}

	/**
	 * Tell if an entry is written right-to-left. The content of the entry is used, or its title when the content is blank, html markup is
	 * removed and the decision is delegated to {@link EstimateDirection#isRTL(String)}.
	 * 
	 * @return false when there is no text to analyze
	 */
	public static boolean isRTL(FeedEntry entry) {
		String text = entry.getContent().getContent();

		if (StringUtils.isBlank(text)) {
			text = entry.getContent().getTitle();
		}

		if (StringUtils.isBlank(text)) {
			return false;
		}

		text = Jsoup.parse(text).text();
		if (StringUtils.isBlank(text)) {
			return false;
		}

		return EstimateDirection.isRTL(text);
	}

	/**
	 * Remove everything before the first '&lt;' and every char that is not allowed in an xml document: control chars other than tab, line
	 * feed and carriage return, the non-characters U+FFFE and U+FFFF, and surrogates.
	 * <p>
	 * NOTE(review): dropping every surrogate also removes valid supplementary characters (e.g. emoji). This is the historical behavior and
	 * is presumably intentional, confirm before changing it.
	 * 
	 * @param xml
	 *            the xml to clean
	 * @return the cleaned xml, or null if the input was blank
	 */
	public static String trimInvalidXmlCharacters(String xml) {
		if (StringUtils.isBlank(xml)) {
			return null;
		}
		StringBuilder sb = new StringBuilder(xml.length());

		boolean firstTagFound = false;
		for (int i = 0; i < xml.length(); i++) {
			char c = xml.charAt(i);

			if (!firstTagFound) {
				if (c != '<') {
					// garbage before the document start
					continue;
				}
				firstTagFound = true;
			}

			boolean allowedControl = c == 9 || c == 10 || c == 13;
			boolean nonCharacter = c == 0xFFFE || c == 0xFFFF;
			if ((c >= 32 || allowedControl) && !nonCharacter && !Character.isSurrogate(c)) {
				sb.append(c);
			}
		}
		return sb.toString();
	}

	/**
	 * When there was an error fetching the feed
	 * 
	 * @param errorCount
	 *            the number of errors encountered so far
	 * @return now if there were less than 3 errors, otherwise now plus one hour per error starting from the third one, capped at one week
	 */
	public static Date buildDisabledUntil(int errorCount) {
		Date now = new Date();
		int retriesBeforeDisable = 3;

		if (errorCount >= retriesBeforeDisable) {
			int disabledHours = errorCount - retriesBeforeDisable + 1;
			disabledHours = Math.min(24 * 7, disabledHours);
			return DateUtils.addHours(now, disabledHours);
		}
		return now;
	}

	/**
	 * When the feed was refreshed successfully
	 * 
	 * @param publishedDate
	 *            the date of the most recent entry, may be null
	 * @param averageEntryInterval
	 *            the average time between entries in milliseconds, may be null
	 * @param defaultRefreshInterval
	 *            despite its name, a point in time: the returned date is never before it when the average interval is used
	 * @return the date before which the feed should not be refreshed again
	 */
	public static Date buildDisabledUntil(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
		Date now = new Date();

		if (publishedDate == null) {
			// feed with no entries, recheck in 24 hours
			return DateUtils.addHours(now, 24);
		} else if (publishedDate.before(DateUtils.addMonths(now, -1))) {
			// older than a month, recheck in 24 hours
			return DateUtils.addHours(now, 24);
		} else if (publishedDate.before(DateUtils.addDays(now, -14))) {
			// older than two weeks, recheck in 12 hours
			return DateUtils.addHours(now, 12);
		} else if (publishedDate.before(DateUtils.addDays(now, -7))) {
			// older than a week, recheck in 6 hours
			return DateUtils.addHours(now, 6);
		} else if (averageEntryInterval != null) {
			// use average time between entries to decide when to refresh next, divided by factor
			int factor = 2;

			// not more than 6 hours
			long date = Math.min(DateUtils.addHours(now, 6).getTime(), now.getTime() + averageEntryInterval / factor);

			// not less than default refresh interval
			date = Math.max(defaultRefreshInterval.getTime(), date);

			return new Date(date);
		} else {
			// unknown case, recheck in 24 hours
			return DateUtils.addHours(now, 24);
		}
	}

	/**
	 * Compute the mean time between two consecutive entries, based on their updated date.
	 * 
	 * @return the mean interval in milliseconds, or null if there are less than two entries
	 */
	public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
		if (entries.size() <= 1) {
			return null;
		}

		List<Long> timestamps = getSortedTimestamps(entries);

		SummaryStatistics stats = new SummaryStatistics();
		for (int i = 0; i < timestamps.size() - 1; i++) {
			long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1));
			stats.addValue(diff);
		}
		return (long) stats.getMean();
	}

	/**
	 * @return the updated dates of the entries as epoch milliseconds, most recent first
	 */
	public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
		return entries.stream().map(t -> t.getUpdated().getTime()).sorted(Collections.reverseOrder()).collect(Collectors.toList());
	}

	/**
	 * Remove a single trailing slash from the url, if there is one.
	 */
	public static String removeTrailingSlash(String url) {
		if (url.endsWith("/")) {
			url = url.substring(0, url.length() - 1);
		}
		return url;
	}

	/**
	 * 
	 * @param url
	 *            the url of the entry
	 * @param feedLink
	 *            the url of the feed as described in the feed
	 * @param feedUrl
	 *            the url of the feed that we used to fetch the feed
	 * @return an absolute url pointing to the entry
	 */
	public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
		url = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
		// check the full scheme separator, a relative link such as "httpd.html" must still be resolved
		if (url == null || url.startsWith("http://") || url.startsWith("https://")) {
			return url;
		}

		String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink;

		if (baseUrl == null) {
			return url;
		}

		String result = null;
		try {
			result = new URL(new URL(baseUrl), url).toString();
		} catch (MalformedURLException e) {
			log.debug("could not parse url : " + e.getMessage(), e);
			result = url;
		}

		return result;
	}

	/**
	 * @return true if the url starts with '/' or '#', or does not start with 'scheme://'
	 */
	public static boolean isRelative(final String url) {
		return url.startsWith("/") || url.startsWith("#") || !ABSOLUTE_URL_PATTERN.matcher(url).matches();
	}

	/**
	 * Build the url of the rest endpoint serving the favicon of a subscription.
	 */
	public static String getFaviconUrl(FeedSubscription subscription, String publicUrl) {
		return removeTrailingSlash(publicUrl) + "/rest/feed/favicon/" + subscription.getId();
	}

	/**
	 * Rewrite the src attribute of every image of the content so that it points to the image proxy.
	 * 
	 * @param content
	 *            the html content, returned untouched when blank
	 * @param publicUrl
	 *            the public url of this server
	 * @return the html of the body of the parsed content, with proxied images
	 */
	public static String proxyImages(String content, String publicUrl) {
		if (StringUtils.isBlank(content)) {
			return content;
		}

		Document doc = Jsoup.parse(content);
		Elements elements = doc.select("img");
		for (Element element : elements) {
			String src = element.attr("src");
			// images without a source are left alone instead of receiving an empty src attribute
			if (StringUtils.isNotBlank(src)) {
				element.attr("src", proxyImage(src, publicUrl));
			}
		}

		return doc.body().html();
	}

	/**
	 * Build the proxy url of an image.
	 * 
	 * @return the url of the proxy endpoint with the encoded image url as parameter, or the url itself when it is blank
	 */
	public static String proxyImage(String url, String publicUrl) {
		if (StringUtils.isBlank(url)) {
			return url;
		}
		return removeTrailingSlash(publicUrl) + "/rest/server/proxy?u=" + imageProxyEncoder(url);
	}

	/**
	 * Rotate every ascii letter by 13 positions, other chars are left untouched. Applying the method twice gives back the original string.
	 */
	public static String rot13(String msg) {
		StringBuilder message = new StringBuilder(msg.length());

		for (char c : msg.toCharArray()) {
			if (c >= 'a' && c <= 'm') {
				c += 13;
			} else if (c >= 'n' && c <= 'z') {
				c -= 13;
			} else if (c >= 'A' && c <= 'M') {
				c += 13;
			} else if (c >= 'N' && c <= 'Z') {
				c -= 13;
			}
			message.append(c);
		}

		return message.toString();
	}

	/**
	 * Encode an image url so that it can be used as the parameter of the image proxy: rot13 then base64.
	 * <p>
	 * The url-safe base64 alphabet is used because the result is placed in a query string, where the '+' of the standard alphabet would be
	 * read back as a space. UTF-8 is used so that the result does not depend on the platform charset.
	 */
	public static String imageProxyEncoder(String url) {
		return Base64.encodeBase64URLSafeString(rot13(url).getBytes(StandardCharsets.UTF_8));
	}

	/**
	 * Reverse {@link #imageProxyEncoder(String)}. Both the url-safe and the standard base64 alphabets are accepted by the decoder.
	 */
	public static String imageProxyDecoder(String code) {
		return rot13(new String(Base64.decodeBase64(code), StandardCharsets.UTF_8));
	}

	/**
	 * Remove from the list the entries that do not satisfy every keyword: an entry is removed when an included keyword is found neither in
	 * its title nor in its content, or when an excluded keyword is found in one of them. Html markup is ignored and the comparison is case
	 * insensitive.
	 * 
	 * @param entries
	 *            the entries to filter, modified in place
	 * @param keywords
	 *            the keywords of the search
	 */
	public static void removeUnwantedFromSearch(List<Entry> entries, List<FeedEntryKeyword> keywords) {
		Iterator<Entry> it = entries.iterator();
		while (it.hasNext()) {
			if (!satisfiesKeywords(it.next(), keywords)) {
				it.remove();
			}
		}
	}

	/**
	 * Check an entry against all keywords, the text of the title and of the content is extracted only once per entry.
	 */
	private static boolean satisfiesKeywords(Entry entry, List<FeedEntryKeyword> keywords) {
		if (keywords.isEmpty()) {
			// nothing to check, avoid parsing the entry
			return true;
		}

		String title = entry.getTitle() == null ? null : Jsoup.parse(entry.getTitle()).text();
		String content = entry.getContent() == null ? null : Jsoup.parse(entry.getContent()).text();

		for (FeedEntryKeyword keyword : keywords) {
			boolean found = StringUtils.containsIgnoreCase(content, keyword.getKeyword())
					|| StringUtils.containsIgnoreCase(title, keyword.getKeyword());
			boolean mustBeAbsent = keyword.getMode() == Mode.EXCLUDE;
			if (found == mustBeAbsent) {
				return false;
			}
		}
		return true;
	}
}