/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.service.listcrawler;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Vector;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.record.Buffer;
import org.commoncrawl.io.NIOBufferList;
import org.commoncrawl.io.NIOBufferListInputStream;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.io.NIOHttpConnection.DataSource;
import org.commoncrawl.io.NIOHttpConnection.State;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.rpc.base.shared.BinaryProtocol;
import org.commoncrawl.service.listcrawler.DiskCacheItem;
import org.commoncrawl.util.HttpCacheUtils;
import org.commoncrawl.util.HttpCookieUtils;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.HttpCacheUtils.LifeTimeInfo;
import org.commoncrawl.util.HttpCookieUtils.CanonicalCookie;
import org.commoncrawl.util.CCStringUtils;
import org.mortbay.util.IO;
/**
* An experimental new version of the crawler cache serving servlet
*
* @author rana
*
*/
public class ProxyServlet2 extends HttpServlet
{
/** Logger for this servlet (category is this class, not the older ProxyServlet). */
private static final Log LOG = LogFactory.getLog(ProxyServlet2.class);
/** Tunnel timeout in milliseconds. */
private int _tunnelTimeoutMs=3000;
/** Worker pool used by service() to write cache items to disk off the request thread. */
private ExecutorService _threadPool = Executors.newFixedThreadPool(100);
/** Cookie store handed to upstream connections and queried for the outgoing Cookie header. */
private HttpCookieUtils.CookieStore _cookieStore = new HttpCookieUtils.CookieStore();
/**
 * Lower-case names of headers that are not copied between the client and the
 * upstream server (checked for request headers, upstream response headers and
 * cached response headers).
 */
protected HashSet _DontProxyHeaders = new HashSet();
// populate the header filter for every instance
{
  _DontProxyHeaders.add("proxy-connection");
  _DontProxyHeaders.add("connection");
  _DontProxyHeaders.add("keep-alive");
  _DontProxyHeaders.add("transfer-encoding");
  _DontProxyHeaders.add("te");
  _DontProxyHeaders.add("trailer");
  _DontProxyHeaders.add("proxy-authorization");
  _DontProxyHeaders.add("proxy-authenticate");
  _DontProxyHeaders.add("upgrade");
  _DontProxyHeaders.add("cache-control");
  _DontProxyHeaders.add("pragma");
  _DontProxyHeaders.add("last-modified");
  _DontProxyHeaders.add("date");
  _DontProxyHeaders.add("age");
  _DontProxyHeaders.add("etag");
  _DontProxyHeaders.add("expires");
  _DontProxyHeaders.add("user-agent");
}
/** Servlet configuration captured in init(ServletConfig). */
private ServletConfig config;
/** Servlet context captured in init(ServletConfig); used for logging in handleConnect. */
private ServletContext context;
/**
 * Stores the servlet configuration and context for later use.
 * <p>
 * The configuration is also passed to the superclass, as the GenericServlet
 * contract requires when this form of init is overridden; the local fields
 * are assigned first so they are already set when the superclass invokes the
 * no-argument init() hook.
 *
 * @param config the configuration supplied by the servlet container
 * @throws ServletException if superclass initialization fails
 * @see javax.servlet.Servlet#init(javax.servlet.ServletConfig)
 */
public void init(ServletConfig config) throws ServletException
{
  this.config=config;
  this.context=config.getServletContext();
  super.init(config);
}
/**
 * Returns the configuration object that was handed to init(ServletConfig).
 *
 * @return the stored servlet configuration (null if init has not run yet)
 * @see javax.servlet.Servlet#getServletConfig()
 */
public ServletConfig getServletConfig()
{
  return this.config;
}
/**
 * Maps a URL to the file that holds its disk cache entry.
 * <p>
 * The URL is canonicalized, reduced to a 64 bit fingerprint, and the decimal
 * form of that fingerprint is used as the file name inside the "diskCache"
 * folder under the proxy server's data directory. The folder is created on
 * demand (the result of mkdir is not checked).
 *
 * @param theURL the URL to locate in the cache
 * @return the cache file location for the URL (the file may not exist)
 * @throws MalformedURLException declared for URL canonicalization failures
 */
private static File cachePathFromURL(URL theURL) throws MalformedURLException {
  final long urlFingerprint =
      URLFingerprint.generate64BitURLFPrint(URLUtils.canonicalizeURL(theURL.toString(),true));
  final File cacheDirectory = new File(ProxyServer.getSingleton().getDataDirectory(),"diskCache");
  cacheDirectory.mkdir();
  return new File(cacheDirectory,String.valueOf(urlFingerprint));
}
/**
 * A blocking request that loads the disk cache entry for one URL.
 */
public static class CacheLoadRequest {
  URL _theURL;

  /**
   * @param theURL the URL whose cache entry should be loaded
   */
  public CacheLoadRequest(URL theURL) {
    _theURL = theURL;
  }

  /**
   * Reads and deserializes the cache file for the URL on the calling thread.
   *
   * @return the cached item, or null when no regular cache file exists or
   *         when an I/O error occurs (the error is logged)
   */
  public DiskCacheItem executeRequest() {
    try {
      // resolve the on-disk location for this url
      final File cacheFile = cachePathFromURL(_theURL);
      // nothing to load unless a regular file is present
      if (!cacheFile.exists() || !cacheFile.isFile()) {
        return null;
      }
      // blocking io is acceptable here; callers run this on the servlet thread
      final FileInputStream fileInput = new FileInputStream(cacheFile);
      try {
        final DiskCacheItem loadedItem = new DiskCacheItem();
        loadedItem.deserialize(new DataInputStream(fileInput),new BinaryProtocol());
        return loadedItem;
      }
      finally {
        fileInput.close();
      }
    }
    catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      return null;
    }
  }
}
public static class NIOConnectionWrapper implements NIOHttpConnection.Listener, DataSource {
private Semaphore _blockingSemaphore = new Semaphore(0);
NIOHttpConnection _connection;
byte[] _uploadBuffer;
boolean _connectionFailed = false;
public NIOConnectionWrapper(NIOHttpConnection connection) {
_connection = connection;
_connection.setListener(this);
}
public void setUploadBuffer(byte[] buffer) {
_uploadBuffer = buffer;
}
@Override
public void HttpConnectionStateChanged(NIOHttpConnection theConnection,State oldState, State state) {
if (state == State.DONE || state == State.ERROR) {
_connectionFailed = (state == State.ERROR);
_blockingSemaphore.release();
_connection.setListener(null);
}
}
@Override
public void HttpContentAvailable(NIOHttpConnection theConnection,NIOBufferList contentBuffer) {
// NOOP
}
@Override
public boolean read(NIOHttpConnection source,NIOBufferList dataBuffer) throws IOException {
if (_uploadBuffer != null) {
dataBuffer.write(_uploadBuffer, 0, _uploadBuffer.length);
_uploadBuffer = null;
}
return true;
}
public boolean waitForCompletion() {
_blockingSemaphore.acquireUninterruptibly();
return !_connectionFailed;
}
@Override
public void finsihedWriting(NIOHttpConnection sourceConnection,
ByteBuffer thisBuffer) throws IOException {
// TODO Auto-generated method stub
}
}
/**
 * Builds an NIOHttpHeaders object from the header item array stored in a
 * cache file, adding each key/value pair in list order.
 *
 * @param items the header items of a cached response
 * @return a new header collection containing every item
 */
private static NIOHttpHeaders buildHeaderFromHeaderItems(ArrayList<ArcFileHeaderItem> items) {
  final NIOHttpHeaders result = new NIOHttpHeaders();
  for (int index = 0; index < items.size(); index++) {
    final ArcFileHeaderItem current = items.get(index);
    result.add(current.getItemKey(), current.getItemValue());
  }
  return result;
}
/**
 * Per-request diagnostic record: the requested URL plus the log lines
 * collected while the request was being serviced.
 */
private static class RequestDetails {
  public URL url;
  ArrayList<String> log = new ArrayList<String>();

  /**
   * Renders the URL on the first line followed by one "--" prefixed line per
   * log entry. Requires url to be set.
   */
  @Override
  public String toString() {
    final StringBuilder text = new StringBuilder();
    text.append("URL:").append(url.toString()).append("\n");
    for (int i = 0; i < log.size(); i++) {
      text.append("--").append(log.get(i)).append("\n");
    }
    return text.toString();
  }
}
/**
 * Handles requests addressed to the proxy itself. The only recognized URI is
 * "/dumpCookies" (case-insensitive), which writes every cookie in the cookie
 * store inside a &lt;pre&gt; block; any other URI produces no output here.
 *
 * @param request  the incoming request (must be an HttpServletRequest)
 * @param response the response to write to
 * @throws IOException if the response writer cannot be obtained
 */
public void serviceProxyInternalRequest(ServletRequest request,ServletResponse response) throws IOException {
  final String requestUri = ((HttpServletRequest)request).getRequestURI();
  if (!requestUri.equalsIgnoreCase("/dumpCookies")) {
    return;
  }
  // snapshot the cookie objects held by the store
  final Vector<CanonicalCookie> snapshot = new Vector<CanonicalCookie>();
  _cookieStore.GetAllCookies(snapshot);
  final PrintWriter out = response.getWriter();
  final HttpServletResponse httpResponse = (HttpServletResponse)response;
  httpResponse.setStatus(200);
  httpResponse.setContentType("text/html");
  out.println("<pre>");
  for (int i = 0; i < snapshot.size(); i++) {
    out.println(snapshot.get(i).toString());
  }
  out.println("</pre>");
}
/**
 * Proxies one request, consulting the disk cache first.
 * <ul>
 * <li>CONNECT requests are delegated to handleConnect.</li>
 * <li>Requests whose server name is "proxy" are delegated to
 *     serviceProxyInternalRequest.</li>
 * <li>Otherwise the disk cache is consulted. If there is no cached item, or
 *     the cached item requires validation, the request is forwarded upstream
 *     (with conditional headers when revalidating). A non-304 upstream response
 *     is relayed to the client and, when its freshness lifetime is non-zero,
 *     scheduled for writing to the disk cache. If a cached item exists and was
 *     not invalidated, the response is served from the cache.</li>
 * </ul>
 * The upstream wait is skipped when the upstream request could not be issued,
 * because in that case nothing would ever signal completion. An upstream 304
 * received without a cached item (i.e. for the client's own conditional
 * request) is passed through to the client as 304.
 *
 * @param req the servlet request (must be an HttpServletRequest)
 * @param res the servlet response (must be an HttpServletResponse)
 * @throws ServletException declared by the servlet contract
 * @throws IOException on request/response I/O failure
 * @see javax.servlet.Servlet#service(javax.servlet.ServletRequest, javax.servlet.ServletResponse)
 */
public void service(ServletRequest req, ServletResponse res) throws ServletException,
IOException
{
  HttpServletRequest request = (HttpServletRequest)req;
  HttpServletResponse response = (HttpServletResponse)res;
  if ("CONNECT".equalsIgnoreCase(request.getMethod()))
  {
    handleConnect(request,response);
  }
  else
  {
    final RequestDetails details = new RequestDetails();
    String uri=request.getRequestURI();
    if (request.getQueryString()!=null)
      uri+="?"+request.getQueryString();
    final URL url = new URL(request.getScheme(),
        request.getServerName(),
        request.getServerPort(),
        uri);
    // requests addressed to the pseudo host "proxy" are handled internally
    if (request.getServerName().equals("proxy")) {
      serviceProxyInternalRequest(req,res);
      return;
    }
    details.url = url;
    // attempt cache load first ...
    CacheLoadRequest cacheLoad = new CacheLoadRequest(url);
    details.log.add("Executing Disk Load Request");
    DiskCacheItem cacheItem = cacheLoad.executeRequest();
    details.log.add("Disk Load Request Returned:" + cacheItem);
    // create metadata placeholder
    CrawlURLMetadata metadata = new CrawlURLMetadata();
    NIOHttpHeaders headers = null;
    boolean revalidate = false;
    boolean cacheItemValid = true;
    if (cacheItem != null) {
      // get headers
      headers = buildHeaderFromHeaderItems(cacheItem.getHeaderItems());
      // set last fetch time in metadata
      metadata.setLastFetchTimestamp(cacheItem.getFetchTime());
      // parse headers
      HttpHeaderInfoExtractor.parseHeaders(headers, metadata);
      // ok now validate cache
      if (HttpCacheUtils.requiresValidation(metadata)) {
        details.log.add("CACHE Item Present But Needs Revalidation");
        revalidate = true;
      }
    }
    // if no cache item or we need to revalidate the cache item ..
    if (cacheItem == null || revalidate) {
      NIOHttpConnection connection = new NIOHttpConnection(
          url,
          ProxyServer.getSingleton().getEventLoop().getSelector(),
          ProxyServer.getSingleton().getEventLoop().getResolver(),_cookieStore);
      NIOConnectionWrapper wrapper = new NIOConnectionWrapper(connection);
      connection.setMethod(request.getMethod());
      // check connection header: any value other than keep-alive/close is
      // treated as a list of header names that must not be forwarded
      String connectionHdr = request.getHeader("Connection");
      if (connectionHdr!=null)
      {
        connectionHdr=connectionHdr.toLowerCase();
        if (connectionHdr.equals("keep-alive")||
            connectionHdr.equals("close"))
          connectionHdr=null;
      }
      // copy headers
      boolean xForwardedFor=false;
      boolean hasContent=false;
      Enumeration enm = request.getHeaderNames();
      while (enm.hasMoreElements())
      {
        // TODO could be better than this!
        String hdr=(String)enm.nextElement();
        String lhdr=hdr.toLowerCase();
        // cookies are injected from the cookie store below, never copied from the client
        if (_DontProxyHeaders.contains(lhdr) || lhdr.equals("cookie"))
          continue;
        if (connectionHdr!=null && connectionHdr.indexOf(lhdr)>=0)
          continue;
        // a content-type header is taken as the signal that a request body is present
        if ("content-type".equals(lhdr))
          hasContent=true;
        Enumeration vals = request.getHeaders(hdr);
        while (vals.hasMoreElements())
        {
          String val = (String)vals.nextElement();
          if (val!=null)
          {
            connection.getRequestHeaders().set(hdr, val);
            details.log.add("req header: "+hdr+": "+val);
            xForwardedFor|="X-Forwarded-For".equalsIgnoreCase(hdr);
          }
        }
      }
      String cookies = _cookieStore.GetCookies(url);
      if (cookies.length() != 0) {
        details.log.add("req injected-header: Cookie:" +cookies);
        connection.getRequestHeaders().set("Cookie", cookies);
      }
      // Proxy headers
      connection.getRequestHeaders().set("Via", "1.1 (jetty)");
      // cache validators (only set in metadata when parsed from a cached item's headers)
      if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) {
        details.log.add("Sending If-Modified-Since");
        connection.getRequestHeaders().set("If-Modified-Since",headers.findValue("Last-Modified"));
      }
      if (metadata.isFieldDirty(CrawlURLMetadata.Field_ETAG)) {
        details.log.add("Sending If-None-Match");
        connection.getRequestHeaders().set("If-None-Match", metadata.getETag());
      }
      if (!xForwardedFor)
        connection.getRequestHeaders().set("X-Forwarded-For",request.getRemoteAddr());
      // buffer the request body (if any) and issue the upstream request
      boolean requestIssued = false;
      try
      {
        InputStream in=request.getInputStream();
        if (hasContent)
        {
          ByteArrayOutputStream stream = new ByteArrayOutputStream();
          IO.copy(in,stream);
          wrapper.setUploadBuffer(stream.toByteArray());
        }
        // Connect
        connection.open();
        requestIssued = true;
      }
      catch (Exception e)
      {
        details.log.add(CCStringUtils.stringifyException(e));
      }
      // only wait when the request was actually issued; otherwise no state
      // change is guaranteed to arrive and the wait would block this thread forever
      boolean connectionSucceeded = requestIssued && wrapper.waitForCompletion();
      // handle status codes etc.
      int code=500;
      if (connectionSucceeded) {
        // set last fetch time in metadata
        metadata.setLastFetchTimestamp(System.currentTimeMillis());
        code=connection.getResponseHeaders().getHttpResponseCode();
        if (revalidate && code != 304) {
          details.log.add("Item ReValidate FAILED");
          cacheItemValid = false;
        }
        if (code != 304) {
          HttpHeaderInfoExtractor.parseHeaders(connection.getResponseHeaders(), metadata);
          response.setStatus(code,"");
          details.log.add("response code:"+code);
          // clear response defaults.
          response.setHeader("Date",null);
          response.setHeader("Server",null);
          // set response headers
          int h=0;
          String hdr=connection.getResponseHeaders().getKey(h);
          String val=connection.getResponseHeaders().getValue(h);
          while(hdr!=null || val!=null)
          {
            String lhdr = hdr!=null?hdr.toLowerCase():null;
            if (hdr!=null && val!=null && !_DontProxyHeaders.contains(lhdr)) {
              response.addHeader(hdr,val);
            }
            // every upstream header is logged, including the ones not forwarded
            details.log.add("response header:" +hdr+": "+val);
            h++;
            hdr=connection.getResponseHeaders().getKey(h);
            val=connection.getResponseHeaders().getValue(h);
          }
          response.addHeader("Via","1.1 (jetty)");
          response.addHeader("cache-control","no-cache,no-store");
          response.addHeader("Connection","close");
          // IF RESULT IS CACHEABLE ...
          LifeTimeInfo lifeTimeInfo = HttpCacheUtils.getFreshnessLifetimeInMilliseconds(metadata);
          details.log.add("getFreshnessLifetime returned:" + lifeTimeInfo._lifetime);
          details.log.add("getFreshnessLifetime source:" + lifeTimeInfo._source);
          if (lifeTimeInfo._lifetime != 0) {
            details.log.add("item is cachable - issuing cache request");
            // construct a disk cache item ...
            final DiskCacheItem cacheItemForWrite = new DiskCacheItem();
            // populate
            cacheItemForWrite.setFetchTime(System.currentTimeMillis());
            cacheItemForWrite.setResponseCode(code);
            // headers (set-cookie is never persisted) ..
            h=0;
            hdr=connection.getResponseHeaders().getKey(h);
            val=connection.getResponseHeaders().getValue(h);
            while(hdr!=null || val!=null)
            {
              if (hdr!=null && val!=null) {
                if (!hdr.toLowerCase().equals("set-cookie")) {
                  ArcFileHeaderItem item = new ArcFileHeaderItem();
                  item.setItemKey(hdr);
                  item.setItemValue(val);
                  cacheItemForWrite.getHeaderItems().add(item);
                }
              }
              h++;
              hdr=connection.getResponseHeaders().getKey(h);
              val=connection.getResponseHeaders().getValue(h);
            }
            if (connection.getContentBuffer().available() != 0) {
              // copy result to byte array
              //VERY INEFFICIENT ... BUT ONLY FOR TESTING ...
              ByteArrayOutputStream tempStream = new ByteArrayOutputStream();
              IO.copy(new NIOBufferListInputStream(connection.getContentBuffer()),tempStream);
              // get the underlying buffer
              byte[] responseBuffer = tempStream.toByteArray();
              // set it into the cache item ...
              cacheItemForWrite.setContent(new Buffer(responseBuffer));
              // and now write out buffer
              IO.copy(new ByteArrayInputStream(responseBuffer),response.getOutputStream());
            }
            // ok schedule a disk cache write ...
            _threadPool.execute(new Runnable() {
              @Override
              public void run() {
                LOG.info("Writing Cache Item for URL:" + url);
                File cacheFileName;
                try {
                  cacheFileName = cachePathFromURL(url);
                  try {
                    FileOutputStream fileStream = new FileOutputStream(cacheFileName);
                    try {
                      DataOutputStream dataOutputStream = new DataOutputStream(fileStream);
                      cacheItemForWrite.serialize(dataOutputStream,new BinaryProtocol());
                    }
                    finally {
                      fileStream.close();
                    }
                  } catch (IOException e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                  }
                } catch (MalformedURLException e) {
                  LOG.error(CCStringUtils.stringifyException(e));
                }
              }
            });
          }
          else {
            details.log.add("FRESHNESS LIFETIME == 0 - SKIPPING CACHE!");
            // no cache direct copy case
            if (connection.getContentBuffer().available() != 0) {
              IO.copy(new NIOBufferListInputStream(connection.getContentBuffer()),response.getOutputStream());
            }
          }
        }
        else if (cacheItem == null) {
          // 304 without a cached item: upstream answered the client's own
          // conditional request, so relay the 304 instead of an empty 200
          details.log.add("response code:"+code);
          response.setStatus(code);
        }
      }
      else {
        response.setStatus(500,"Proxy Request Failed");
        details.log.add("Proxy Request Failed");
      }
    }
    // ok now, if cache item != null and cache-item is still valid
    if (cacheItem != null && cacheItemValid) {
      // service request from cache
      details.log.add("Servicing Request From Disk Cache");
      // clear response defaults.
      response.setHeader("Date",null);
      response.setHeader("Server",null);
      // set response code
      response.setStatus(cacheItem.getResponseCode());
      // set response headers
      for (ArcFileHeaderItem headerItem : cacheItem.getHeaderItems()) {
        String key = headerItem.getItemKey().toLowerCase();
        // if not in don't proxy headers ...
        if (key.length() != 0) {
          if (!_DontProxyHeaders.contains(key) && !key.equals("set-cookie")) {
            response.addHeader(headerItem.getItemKey(),headerItem.getItemValue());
            details.log.add("cache response: "+headerItem.getItemKey()+": "+headerItem.getItemValue());
          }
          else {
            details.log.add("cache hidden-hdr: "+headerItem.getItemKey()+": "+headerItem.getItemValue());
          }
        }
      }
      response.addHeader("Via","1.1 (jetty)");
      response.addHeader("cache-control","no-cache,no-store");
      response.addHeader("Connection","close");
      if (cacheItem.getContent().getCount() != 0) {
        // replace any cached Content-Length with the actual stored byte count
        response.setHeader("Content-Length",null);
        response.addHeader("Content-Length",Integer.toString(cacheItem.getContent().getCount()));
        IO.copy(new ByteArrayInputStream(cacheItem.getContent().getReadOnlyBytes()),response.getOutputStream());
      }
    }
    LOG.info(details.toString());
  }
}
/* ------------------------------------------------------------ */
/**
 * Handles a CONNECT request by opening a socket to the requested target and
 * relaying bytes in both directions.
 * <p>
 * The request URI is first parsed as a URL, as before. If that fails, it is
 * interpreted as the authority form "host[:port]" used by CONNECT request
 * lines. Port 80 is used when no port is given.
 * <p>
 * The upstream socket is closed if tunnel setup or the client-to-server copy
 * fails. NOTE(review): on normal completion the socket is left open, as in the
 * original code, because the background copy thread may still be relaying
 * data - confirm who is expected to close it.
 *
 * @param request  the CONNECT request
 * @param response the response used for the client side of the tunnel
 * @throws IOException if the target cannot be parsed, resolved or connected
 *         to, or if relaying fails
 */
public void handleConnect(HttpServletRequest request,
HttpServletResponse response)
throws IOException
{
  String uri = request.getRequestURI();
  context.log("CONNECT: "+uri);
  String host;
  int port;
  try {
    URL url = new URL(uri);
    host = url.getHost();
    port = (url.getPort() != -1) ? url.getPort() : 80;
  }
  catch (MalformedURLException e) {
    if (uri == null) {
      throw e;
    }
    // not a URL: fall back to authority form host[:port]; the port separator
    // must come after any closing bracket of an IPv6 literal
    int colon = uri.lastIndexOf(':');
    if (colon > 0 && colon > uri.lastIndexOf(']')) {
      host = uri.substring(0,colon);
      try {
        port = Integer.parseInt(uri.substring(colon+1));
      }
      catch (NumberFormatException badPort) {
        throw new MalformedURLException("Invalid CONNECT target: "+uri);
      }
    }
    else {
      host = uri;
      port = 80;
    }
  }
  InetAddress address = InetAddress.getByName(host);
  InputStream in=request.getInputStream();
  OutputStream out=response.getOutputStream();
  Socket socket = new Socket(address,port);
  boolean completed = false;
  try {
    context.log("Socket: "+socket);
    response.setStatus(200);
    response.setHeader("Connection","close");
    response.flushBuffer();
    System.err.println(response);
    context.log("out<-in");
    // server -> client is pumped on a background thread
    IO.copyThread(socket.getInputStream(),out);
    context.log("in->out");
    // client -> server is pumped on this thread until the client input ends
    IO.copy(in,socket.getOutputStream());
    completed = true;
  }
  finally {
    if (!completed) {
      // do not leak the upstream socket when the tunnel fails
      try {
        socket.close();
      }
      catch (IOException ignored) {
        // best effort: the original failure is the one that propagates
      }
    }
  }
}
/**
 * Returns a short human-readable description of this servlet.
 *
 * @return the fixed description string
 * @see javax.servlet.Servlet#getServletInfo()
 */
public String getServletInfo()
{
  final String description = "Proxy Servlet";
  return description;
}
/**
 * Releases resources held by this servlet when it is taken out of service.
 * <p>
 * Initiates an orderly shutdown of the cache-write thread pool: writes that
 * were already submitted still run, but no new ones are accepted. Without
 * this the pool's worker threads would outlive the servlet.
 *
 * @see javax.servlet.Servlet#destroy()
 */
public void destroy()
{
  _threadPool.shutdown();
}
}