package org.archive.wayback.accesscontrol.robotstxt.redis;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Date;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.archive.wayback.accesscontrol.robotstxt.redis.SimpleRedisRobotsCache.RobotsResult;
import org.archive.wayback.util.webapp.AbstractRequestHandler;
//TODO: Add a proper jsp/view for this
//This is a simple prototype of the update-robots mechanism
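/**
 * Request handler that forces a refresh of a cached robots.txt entry in the
 * {@link SimpleRedisRobotsCache} and reports the old and new contents as a
 * bare-bones HTML page.
 *
 * A sketch of how this handler might be wired up, assuming the usual Wayback
 * Spring XML conventions (the bean name and ref below are illustrative, not
 * taken from any actual config):
 *
 * <pre>
 * &lt;bean name="8080:robotsupdate" class="org.archive.wayback.accesscontrol.robotstxt.redis.UpdateRobotsRequestHandler"&gt;
 *   &lt;property name="robotsCache" ref="robotsCache" /&gt;
 *   &lt;property name="minUpdateTime" value="0" /&gt;
 * &lt;/bean&gt;
 * </pre>
 */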
public class UpdateRobotsRequestHandler extends AbstractRequestHandler {
	protected final static String HTTP_PREFIX = "http://";
	protected final static String HTTPS_PREFIX = "https://";
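	// www-variants of the prefixes, currently referenced only by the
	// disabled host normalization noted in handleRequest() below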
	protected final static String WWW_PREFIX = "www.";
	protected final static String HTTP_WWW_PREFIX = HTTP_PREFIX + WWW_PREFIX;
	protected final static String HTTPS_WWW_PREFIX = HTTPS_PREFIX + WWW_PREFIX;
	protected final static String ROBOT_SUFFIX = "/robots.txt";
	private SimpleRedisRobotsCache robotsCache;

	// Minimum time (secs) between successive forced updates of the same
	// robots.txt. Default: 0 (no limit) for now
	private int minUpdateTime = 0;
	public int getMinUpdateTime() {
		return minUpdateTime;
	}

	public void setMinUpdateTime(int minUpdateTime) {
		this.minUpdateTime = minUpdateTime;
	}

	public SimpleRedisRobotsCache getRobotsCache() {
		return robotsCache;
	}

	public void setRobotsCache(SimpleRedisRobotsCache robotsCache) {
		this.robotsCache = robotsCache;
	}
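	/**
	 * Forces a refresh of the robots.txt named by the request path and
	 * renders the old/new contents as a minimal HTML page. The target URL is
	 * whatever follows this handler's mapping; for instance, if the handler
	 * were mapped at <code>robotsupdate</code> (an illustrative name, not
	 * from any actual config), a request for
	 * <code>/robotsupdate/http://example.com/robots.txt</code> would refresh
	 * <code>http://example.com/robots.txt</code>. URLs without a scheme
	 * default to <code>http://</code>.
	 */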
	@Override
	public boolean handleRequest(HttpServletRequest httpRequest,
			HttpServletResponse httpResponse) throws ServletException,
			IOException {

		String url = this.translateRequestPath(httpRequest);

		if (!url.endsWith(ROBOT_SUFFIX)) {
			httpResponse.setContentType("text/plain");
			httpResponse.getWriter().println("The request must end in /robots.txt");
			return true;
		}

		// Set the content type before acquiring the writer so the charset
		// takes effect
		httpResponse.setContentType("text/html");
		PrintWriter writer = httpResponse.getWriter();

		writer.println("<html><body><h2>Wayback Robots Updater</h2>");

		if (robotsCache == null) {
			writer.println("<p>No Robots Cache Set</p>");
		} else {
			// TODO: normalize www. / non-www variants of the host
			// (e.g. http://example.com vs http://www.example.com) before
			// updating, so both cache entries stay in sync
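			// Default to http:// when the request path omits the scheme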
			if (!url.startsWith(HTTP_PREFIX) && !url.startsWith(HTTPS_PREFIX)) {
				url = HTTP_PREFIX + url;
			}
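			// Force a refresh, honoring minUpdateTime; the final boolean
			// appears to control whether failures are cached, and is
			// disabled here (an assumption, see SimpleRedisRobotsCache)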
			RobotsResult result = robotsCache.forceUpdate(url, minUpdateTime, false);
			if (result == null) {
				writer.println("<p>Error Updating Robots (see logs)</p>");
				writer.println("</body></html>");
				return true;
			}

			if (!result.isSameRobots()) {
				writer.println("<b>UPDATED Robots</b>");
				writer.println("<p><i>Old Robots:</i></p>");
				writer.println("<pre>" + result.oldRobots + "</pre>");
				writer.println("<p><i>New Robots:</i></p>");
			} else {
				writer.println("<b>Robots Unchanged</b>");
				writer.println("<p><i>Current Robots:</i></p>");
			}

			writer.print("<pre>");
			if (result.robots != null && result.status == 200) {
				writer.print(result.robots);
			} else {
				writer.print("No Valid Robots Found: Status " + result.status);
			}
			writer.println("</pre>");
		}
writer.println("<p><i>Current Time: " + new Date().toString() + "</p></body></html>");
return true;
}
}