package org.limewire.core.impl.search.torrentweb;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.http.client.utils.URIUtils;
import org.limewire.collection.Tuple;
import org.limewire.logging.Log;
import org.limewire.logging.LogFactory;
import org.limewire.util.FileUtils;
import org.limewire.util.StringUtils;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.inject.Inject;
import com.google.inject.assistedinject.Assisted;
/**
 * Implements {@link TorrentUriPrioritizer} by performing the following steps
 * on a list of uris:
 * <ul>
 * <li>Remove duplicate uris, keeping the first occurrence of each</li>
 * <li>Remove uris that are known to not be torrent uris</li>
 * <li>Calculate a likelihood score for a uri being a torrent uri</li>
 * <li>Sort list by this score, most likely candidates first</li>
 * <li>Cap list to the top n elements in it</li>
 * </ul>
 */
public class TorrentUriPrioritizerImpl implements TorrentUriPrioritizer {

    private static final Log LOG = LogFactory.getLog(TorrentUriPrioritizerImpl.class);

    /** Matches path elements that consist of digits only. */
    private final static Pattern numbers = Pattern.compile("[0-9]+");

    /** Upper bound on the number of uris returned by {@link #prioritize(List)}. */
    private final static int MAX_URIS = 20;

    /** Canonical host of the referrer, or the empty string if it has none. */
    private final String referrerHost;

    /**
     * List of predicates contributing to the likelihood score of a uri
     * being a torrent uri. They are ordered by weight: the first predicate
     * ends up in the most significant bit of the score computed by
     * {@link #computeScore(URI)}.
     */
    @SuppressWarnings("unchecked")
    private final List<Predicate<URI>> predicates = ImmutableList.of(
            new IsTorrentUriPredicate(),
            new IsMagnetUriPredicate(),
            new UriSimilarToOtherTorrentUriPredicate(),
            new UriEndsWithTorrentPredicate(),
            new UriContainsQueryPredicate(),
            new UriOnSameHostAsReferrerPredicate());

    /**
     * Tokenized and lowercased query tokens; never contains empty strings.
     */
    private final String[] queryTokens;

    private final TorrentUriStore torrentUriStore;

    /**
     * @param referrer uri of the page the candidate uris were found on
     * @param query the search query, split on whitespace into tokens
     * @param torrentUriStore store of what is already known about uris
     */
    @Inject
    public TorrentUriPrioritizerImpl(@Assisted URI referrer, @Assisted String query,
            TorrentUriStore torrentUriStore) {
        this.torrentUriStore = torrentUriStore;
        this.queryTokens = normalizeQueryTokens(query.split("\\s"));
        String host = org.limewire.util.URIUtils.getCanonicalHost(referrer);
        this.referrerHost = host != null ? host : "";
    }

    /**
     * Returns at most {@link #MAX_URIS} of the given candidates, ordered from
     * most to least likely to be a torrent uri. Candidates with equal scores
     * keep the relative order they had in <code>candidates</code>.
     */
    @Override
    public List<URI> prioritize(List<URI> candidates) {
        // remove duplicates
        List<URI> unique = uniquify(candidates);
        // remove known non torrent uris
        List<URI> remaining = filter(unique, new NotTorrentUriPredicate());
        LOG.debugf("removed non torrents: {0}, new size {1}", unique.size() - remaining.size(), remaining.size());
        // compute scores
        List<Tuple<URI, Integer>> scoredUris = transform(remaining, new TorrentUriLikelihoodFunction());
        // sort by how likely a candidate; the sort is stable, so ties keep input order
        Collections.sort(scoredUris, new ScoreComparator());
        // only look at the top n uris
        List<Tuple<URI, Integer>> topScored = scoredUris.subList(0, Math.min(scoredUris.size(), MAX_URIS));
        // transform back
        return transform(topScored, new UriExtractor());
    }

    /**
     * Applies <code>function</code> to elements of <code>list</code> and
     * returns a new {@link ArrayList} with results.
     */
    static <S, T> List<T> transform(List<S> list, Function<S, T> function) {
        List<T> transformed = new ArrayList<T>(list.size());
        for (S element : list) {
            transformed.add(function.apply(element));
        }
        return transformed;
    }

    /**
     * Filters elements of <code>list</code> using <code>predicate</code>
     * keeping elements that the predicate applies to and returns a new
     * {@link ArrayList} with those elements.
     */
    <T> List<T> filter(List<T> list, Predicate<T> predicate) {
        List<T> filtered = new ArrayList<T>(list.size());
        for (T element : list) {
            if (predicate.apply(element)) {
                filtered.add(element);
            }
        }
        return filtered;
    }

    /**
     * Removes duplicates from the list while preserving the order in which
     * the uris were first encountered, so that the later stable sort breaks
     * ties deterministically instead of by hash order.
     */
    private List<URI> uniquify(List<URI> candidates) {
        return new ArrayList<URI>(new LinkedHashSet<URI>(candidates));
    }

    /**
     * Records whether <code>uri</code> is a torrent uri. For torrent uris
     * that have both a host and a path, a canonicalized form of the uri
     * (see {@link #tokenize(String)}) is additionally stored for its host.
     */
    @Override
    public void setIsTorrent(URI uri, boolean isTorrent) {
        torrentUriStore.setIsTorrentUri(uri, isTorrent);
        if (!isTorrent) {
            return;
        }
        String host = org.limewire.util.URIUtils.getCanonicalHost(uri);
        String path = uri.getPath();
        if (host == null || path == null) {
            LOG.debugf("host or path null {0}, {1}", host, path);
            return;
        }
        String canonicalPath = "/" + StringUtils.explode(tokenize(path), "/");
        URI canonicalReference;
        try {
            // getPath() returns the decoded path, so it may contain characters
            // (e.g. spaces) that are illegal in a uri string; the multi-argument
            // constructor quotes them instead of rejecting the whole path.
            canonicalReference = new URI(null, null, canonicalPath, null);
        } catch (URISyntaxException e) {
            LOG.debugf("could not canonicalize path {0}: {1}", canonicalPath, e);
            return;
        }
        URI canonicalUri = URIUtils.resolve(uri, canonicalReference);
        LOG.debugf("canonicalized uri: {0}", canonicalUri);
        torrentUriStore.addCanonicalTorrentUri(host, canonicalUri);
    }

    /**
     * @return the torrent uris stored for the canonical host of
     * <code>uri</code>, or an empty set if the uri has no host
     */
    Set<URI> getTorrentUrisForDomain(URI uri) {
        String host = org.limewire.util.URIUtils.getCanonicalHost(uri);
        if (host != null) {
            return torrentUriStore.getTorrentUrisForHost(host);
        }
        return Collections.emptySet();
    }

    /**
     * @return true if <code>uri</code> is structurally similar to any of the
     * uris in <code>torrentUris</code>
     */
    boolean isStructurallySimilar(URI uri, Iterable<URI> torrentUris) {
        for (URI torrentUri : torrentUris) {
            if (isStructurallySimilar(uri, torrentUri)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tokenizes a uri path, normalizing it and replacing the query and numerical
     * elements in the path with placeholders. Empty path elements are dropped.
     */
    List<String> tokenize(String path) {
        String[] tokens = path.split("[/?#]");
        List<String> canonicalized = new ArrayList<String>(tokens.length);
        for (String token : tokens) {
            if (token.isEmpty()) {
                // leading or doubled separators produce empty elements
                continue;
            }
            if (containsQuery(token)) {
                canonicalized.add("*query*");
            } else if (numbers.matcher(token).matches()) {
                canonicalized.add("*numbers*");
            } else {
                canonicalized.add(token);
            }
        }
        return canonicalized;
    }

    /**
     * Compares the {@link #tokenize(String) tokenized} paths of both uris.
     * One point is awarded if both paths have the same number of tokens; then
     * the tokens are compared pairwise (up to the length of the shorter path),
     * adding a point for each case-insensitive match and subtracting one for
     * each mismatch.
     *
     * @return true if the resulting score is greater than 3, false otherwise
     * or if either uri has no path
     */
    boolean isStructurallySimilar(URI uri, URI torrentUri) {
        String path = uri.getPath();
        String torrentPath = torrentUri.getPath();
        if (path == null || torrentPath == null) {
            return false;
        }
        List<String> pathTokens = tokenize(path);
        List<String> torrentPathTokens = tokenize(torrentPath);
        int score = 0;
        if (pathTokens.size() == torrentPathTokens.size()) {
            score += 1;
        }
        for (Tuple<String, String> tuple : zip(pathTokens, torrentPathTokens)) {
            if (tuple.getFirst().equalsIgnoreCase(tuple.getSecond())) {
                score += 1;
            } else {
                score -= 1;
            }
        }
        return score > 3;
    }

    /**
     * @return true if <code>value</code> contains all query tokens, ignoring
     * case; false if the query has no tokens at all, so that a blank query
     * does not match every value
     */
    boolean containsQuery(String value) {
        if (queryTokens.length == 0) {
            return false;
        }
        // locale-independent folding, consistent with how the tokens were lowercased
        String lowerCased = value.toLowerCase(Locale.ROOT);
        for (String token : queryTokens) {
            if (!lowerCased.contains(token)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Builds the score as a bit field: each predicate contributes one bit,
     * with earlier predicates in {@link #predicates} landing in more
     * significant positions.
     *
     * @return the likelihood score of a uri being a torrent uri
     */
    private int computeScore(URI uri) {
        int score = 0;
        for (Predicate<URI> predicate : predicates) {
            if (predicate.apply(uri)) {
                score += 1;
            }
            score <<= 1;
        }
        return score;
    }

    /**
     * Lowercases the given tokens independently of the default locale and
     * drops empty tokens, which would otherwise match any value.
     */
    private static String[] normalizeQueryTokens(String... tokens) {
        List<String> results = new ArrayList<String>(tokens.length);
        for (String token : tokens) {
            if (!token.isEmpty()) {
                results.add(token.toLowerCase(Locale.ROOT));
            }
        }
        return results.toArray(new String[results.size()]);
    }

    /** Applies to uris the store reports as torrent uris. */
    private class IsTorrentUriPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            return torrentUriStore.isTorrentUri(uri);
        }
    }

    /** Applies to uris with the <code>magnet</code> scheme. */
    private static class IsMagnetUriPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            return "magnet".equalsIgnoreCase(uri.getScheme());
        }
    }

    /**
     * Applies to uris that are structurally similar to a torrent uri already
     * stored for the same host.
     */
    private class UriSimilarToOtherTorrentUriPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            Set<URI> torrentUris = getTorrentUrisForDomain(uri);
            if (torrentUris.isEmpty()) {
                return false;
            }
            return isStructurallySimilar(uri, torrentUris);
        }
    }

    /** Applies to uris whose string form contains all query tokens. */
    private class UriContainsQueryPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            return containsQuery(uri.toString());
        }
    }

    /** Applies to uris whose path has the file extension <code>torrent</code>. */
    private static class UriEndsWithTorrentPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            String path = uri.getPath();
            if (path == null) {
                return false;
            }
            // constant first so a missing extension can never cause an NPE
            return "torrent".equalsIgnoreCase(FileUtils.getFileExtension(path));
        }
    }

    /** Applies to uris whose canonical host equals the referrer's host. */
    private class UriOnSameHostAsReferrerPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            return referrerHost.equals(org.limewire.util.URIUtils.getCanonicalHost(uri));
        }
    }

    /** Applies to uris that are not known to be non-torrent uris. */
    private class NotTorrentUriPredicate implements Predicate<URI> {
        @Override
        public boolean apply(URI uri) {
            return !torrentUriStore.isNotTorrentUri(uri);
        }
    }

    /** Pairs a uri with its likelihood score. */
    private class TorrentUriLikelihoodFunction implements Function<URI, Tuple<URI, Integer>> {
        @Override
        public Tuple<URI, Integer> apply(URI uri) {
            return new Tuple<URI, Integer>(uri, computeScore(uri));
        }
    }

    /** Orders scored uris by descending score. */
    private static class ScoreComparator implements Comparator<Tuple<URI, Integer>> {
        @Override
        public int compare(Tuple<URI, Integer> o1, Tuple<URI, Integer> o2) {
            return o2.getSecond().compareTo(o1.getSecond());
        }
    }

    /** Extracts the uri from a scored uri. */
    private static class UriExtractor implements Function<Tuple<URI, Integer>, URI> {
        @Override
        public URI apply(Tuple<URI, Integer> tuple) {
            return tuple.getFirst();
        }
    }

    /**
     * @return an iterable of tuples from <code>iterableS</code> and
     * <code>iterableT</code>; iteration ends as soon as either is exhausted
     */
    static <S, T> Iterable<Tuple<S, T>> zip(final Iterable<S> iterableS, final Iterable<T> iterableT) {
        return new Iterable<Tuple<S, T>>() {
            @Override
            public Iterator<Tuple<S, T>> iterator() {
                return new ZipIterator<S, T>(iterableS.iterator(), iterableT.iterator());
            }
        };
    }

    /** Iterates two iterators in lockstep, yielding pairs of their elements. */
    private static class ZipIterator<S, T> implements Iterator<Tuple<S, T>> {

        private final Iterator<S> iteratorS;
        private final Iterator<T> iteratorT;

        public ZipIterator(Iterator<S> iteratorS, Iterator<T> iteratorT) {
            this.iteratorS = iteratorS;
            this.iteratorT = iteratorT;
        }

        @Override
        public boolean hasNext() {
            return iteratorS.hasNext() && iteratorT.hasNext();
        }

        @Override
        public Tuple<S, T> next() {
            // check up front so neither iterator is advanced when the other is exhausted
            if (!hasNext()) {
                throw new NoSuchElementException();
            }
            return new Tuple<S, T>(iteratorS.next(), iteratorT.next());
        }

        @Override
        public void remove() {
            iteratorS.remove();
            iteratorT.remove();
        }
    }
}