/*
 * Created by Angel Leon (@gubatron), Alden Torres (aldenml)
 * Copyright (c) 2011-2014, FrostWire(R). All rights reserved.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.frostwire.search.torrent;

import java.nio.charset.StandardCharsets;
import java.util.LinkedList;
import java.util.List;

import com.frostwire.logging.Logger;
import com.frostwire.search.CrawlRegexSearchPerformer;
import com.frostwire.search.CrawlableSearchResult;
import com.frostwire.search.MaxIterCharSequence;
import com.frostwire.search.PerformersHelper;
import com.frostwire.search.SearchMatcher;
import com.frostwire.search.SearchResult;
import com.frostwire.search.domainalias.DomainAliasManager;
import com.google.code.regexp.Matcher;
import com.google.code.regexp.Pattern;

/**
 * Base class for torrent search performers that work in two regex passes:
 * a preliminary regex extracts candidate results from the search page, and a
 * second regex extracts the final result from each crawled detail page (or,
 * when the crawled data is torrent/magnet bytes, delegates to
 * {@link PerformersHelper#crawlTorrent}).
 *
 * @author gubatron
 * @author aldenml
 */
public abstract class TorrentRegexSearchPerformer<T extends CrawlableSearchResult> extends CrawlRegexSearchPerformer<CrawlableSearchResult> {

    private static final Logger LOG = Logger.getLogger(TorrentRegexSearchPerformer.class);

    /** Matches candidate entries on the preliminary search results page. */
    private final Pattern preliminarySearchResultsPattern;

    /** Matches the relevant data inside a crawled HTML detail page. */
    private final Pattern htmlDetailPagePattern;

    public TorrentRegexSearchPerformer(DomainAliasManager domainAliasManager, long token, String keywords, int timeout, int pages, int numCrawls, int regexMaxResults, String preliminarSearchResultsRegex, String htmlDetailPagePatternRegex) {
        super(domainAliasManager, token, keywords, timeout, pages, numCrawls, regexMaxResults);
        this.preliminarySearchResultsPattern = Pattern.compile(preliminarSearchResultsRegex);
        this.htmlDetailPagePattern = Pattern.compile(htmlDetailPagePatternRegex);
    }

    @Override
    public Pattern getPattern() {
        return preliminarySearchResultsPattern;
    }

    /**
     * Returns the URL to crawl for the given preliminary result: the torrent
     * URL when available, otherwise the result's details page URL.
     */
    @Override
    protected String getCrawlUrl(CrawlableSearchResult sr) {
        if (sr instanceof TorrentCrawlableSearchResult) {
            return ((TorrentCrawlableSearchResult) sr).getTorrentUrl();
        }
        return sr.getDetailsUrl();
    }

    /**
     * Turns the crawled bytes for {@code sr} into final search results.
     * Torrent/magnet data is parsed via {@link PerformersHelper#crawlTorrent};
     * HTML detail pages are first reduced via {@link #reduceHtml(String)} and
     * then matched against the detail-page regex.
     *
     * @throws Exception wrapping any matching failure, with the details URL in
     *                   the message for diagnosis (original cause preserved)
     */
    @Override
    protected List<? extends SearchResult> crawlResult(CrawlableSearchResult sr, byte[] data) throws Exception {
        List<SearchResult> list = new LinkedList<SearchResult>();
        if (sr instanceof TorrentCrawlableSearchResult) {
            //in case we fetched a torrent's info (magnet, or the .torrent itself) to obtain
            list.addAll(PerformersHelper.crawlTorrent(this, (TorrentCrawlableSearchResult) sr, data));
        } else {
            // Explicit charset: avoids the checked UnsupportedEncodingException
            // of the String(byte[], String) constructor and any platform-default ambiguity.
            String html = reduceHtml(new String(data, StandardCharsets.UTF_8));
            if (html != null) {
                // MaxIterCharSequence bounds regex work to guard against runaway backtracking.
                Matcher matcher = htmlDetailPagePattern.matcher(new MaxIterCharSequence(html, 2 * html.length()));
                try {
                    if (matcher.find()) {
                        T searchResult = fromHtmlMatcher(sr, SearchMatcher.from(matcher));
                        if (searchResult != null) {
                            list.add(searchResult);
                        }
                    } else {
                        LOG.error("Update Necessary: Search broken for " + sr.getClass().getPackage().getName() + " (please notify dev-team on twitter @frostwire or write to contact@frostwire.com if you keep seeing this message.)");
                    }
                } catch (Exception e) {
                    // Re-wrap with the URL for context; the original cause is preserved.
                    throw new Exception("URL:" + sr.getDetailsUrl(), e);
                }
            } else {
                LOG.error("Update Necessary: HTML could not be reduced for optimal search. Search broken for " + sr.getClass().getPackage().getName() + " (please notify dev-team on twitter @frostwire or write to contact@frostwire.com if you keep seeing this message.)");
            }
        }
        return list;
    }

    /**
     * Sometimes the HTML_REGEX has to work on too big of an HTML file.
     * In order to minimize the chance for long backtracking times we can
     * override this method to specify what offset of the HTML file our
     * REGEX should start focusing on. Return -1 to signal the marker was
     * not found and the page should be treated as broken.
     */
    protected int prefixOffset(String html) {
        return 0;
    }

    /**
     * Sometimes the HTML_REGEX has to work on too big of an HTML file.
     * In order to minimize the chance for long backtracking times we can
     * override this method to specify what offset of the HTML file our
     * REGEX should stop focusing on. Return -1 to signal the marker was
     * not found and the page should be treated as broken.
     */
    protected int suffixOffset(String html) {
        return html.length();
    }

    /**
     * Trims the HTML to the [prefixOffset, suffixOffset) window chosen by the
     * subclass, or returns {@code null} when either offset is -1 (marker not
     * found, page considered broken).
     */
    private String reduceHtml(String html) {
        int preOffset = prefixOffset(html);
        int sufOffset = suffixOffset(html);
        if (preOffset == -1 || sufOffset == -1) {
            return null;
        }
        if (preOffset > 0 || sufOffset < html.length()) {
            // Since JDK 7u6 substring() already copies its backing array, so the
            // old new String(...toCharArray()) memory-trim trick is unnecessary.
            return html.substring(preOffset, sufOffset);
        }
        return html;
    }

    /**
     * Builds the final search result from the detail-page regex match; may
     * return {@code null} when the match cannot be turned into a result.
     */
    protected abstract T fromHtmlMatcher(CrawlableSearchResult sr, SearchMatcher matcher);
}