package org.limewire.core.impl.search.torrentweb; import java.io.BufferedOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.URI; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.methods.HttpGet; import org.limewire.http.httpclient.LimeHttpClient; import org.limewire.http.httpclient.robots.RobotsDirectives; import org.limewire.http.httpclient.robots.RobotsTxt; import org.limewire.io.IOUtils; import org.limewire.io.InvalidDataException; import org.limewire.logging.Log; import org.limewire.logging.LogFactory; import org.limewire.util.FileUtils; import org.limewire.util.StringUtils; import org.limewire.util.URIUtils; import com.google.inject.Inject; import com.google.inject.Provider; import com.limegroup.gnutella.util.LimeWireUtils; /** * Implements {@link TorrentRobotsTxt} by first doing a local lookup for * a cached robots.txt file in {@link TorrentRobotsTxtStore}, otherwise downloads * robotst.txt from webserver and stores it in {@link TorrentRobotsTxtStore}. */ public class TorrentRobotsTxtImpl implements TorrentRobotsTxt { private final static Log LOG = LogFactory.getLog(TorrentRobotsTxtImpl.class); private final Provider<LimeHttpClient> limeHttpClient; private final TorrentRobotsTxtStore torrentRobotsTxtStore; @Inject public TorrentRobotsTxtImpl(Provider<LimeHttpClient> limeHttpClient, TorrentRobotsTxtStore torrentRobotsTxtStore) { this.limeHttpClient = limeHttpClient; this.torrentRobotsTxtStore = torrentRobotsTxtStore; } @Override public boolean isAllowed(URI uri) { String host = URIUtils.getCanonicalHost(uri); if (host == null || uri.getPath() == null) { return true; } String robotsTxt = torrentRobotsTxtStore.getRobotsTxt(host); if (robotsTxt == null) { robotsTxt = getRobotsTxt(uri); torrentRobotsTxtStore.storeRobotsTxt(host, robotsTxt); } try { RobotsTxt parser = new RobotsTxt(robotsTxt); return isAllowed(parser, uri); } catch (InvalidDataException e) { LOG.debug("error parsing robots txt", e); return true; } } /** * @return empty string on error to avoid future requests for robots.txt */ private String getRobotsTxt(URI uri) { LimeHttpClient httpClient = limeHttpClient.get(); HttpGet get = new HttpGet(org.apache.http.client.utils.URIUtils.resolve(uri, "/robots.txt")); HttpResponse response = null; BufferedOutputStream out = null; try { response = httpClient.execute(get); if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) { HttpEntity entity = response.getEntity(); if (entity == null) { return ""; } File file = createTmpFile(uri); out = new BufferedOutputStream(new FileOutputStream(file)); entity.writeTo(out); out.close(); if (file.length() < TorrentRobotsTxtStore.MAX_ROBOTS_TXT_SIZE) { byte[] contents = FileUtils.readFileFully(file); if (contents != null) { return StringUtils.toUTF8String(contents); } } } } catch (IOException e) { LOG.debug("error getting robots.txt", e); } finally { httpClient.releaseConnection(response); IOUtils.close(out); } return ""; } private boolean isAllowed(RobotsTxt robotsTxt, URI uri) { RobotsDirectives robotsDirectives = robotsTxt.getDirectivesFor(LimeWireUtils.getHttpServer()); LOG.debugf("directives for {0}: {1}", uri, robotsDirectives); return robotsDirectives.allows(uri.getPath()); } private File createTmpFile(URI uri) throws IOException { File file = File.createTempFile(URIUtils.getCanonicalHost(uri), ".robots.txt"); file.deleteOnExit(); return file; } }