/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.listcrawler; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.PrintWriter; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.Socket; import java.net.URL; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Enumeration; import java.util.HashSet; import java.util.Vector; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Semaphore; import javax.servlet.ServletConfig; import javax.servlet.ServletContext; import javax.servlet.ServletException; import javax.servlet.ServletRequest; import javax.servlet.ServletResponse; import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.record.Buffer; import org.commoncrawl.io.NIOBufferList; import org.commoncrawl.io.NIOBufferListInputStream; import org.commoncrawl.io.NIOHttpConnection; import org.commoncrawl.io.NIOHttpHeaders; import org.commoncrawl.io.NIOHttpConnection.DataSource; import org.commoncrawl.io.NIOHttpConnection.State; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.protocol.shared.ArcFileHeaderItem; import org.commoncrawl.rpc.base.shared.BinaryProtocol; import org.commoncrawl.service.listcrawler.DiskCacheItem; import org.commoncrawl.util.HttpCacheUtils; import org.commoncrawl.util.HttpCookieUtils; import org.commoncrawl.util.HttpHeaderInfoExtractor; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.HttpCacheUtils.LifeTimeInfo; import org.commoncrawl.util.HttpCookieUtils.CanonicalCookie; import org.commoncrawl.util.CCStringUtils; import org.mortbay.util.IO; /** * An experimental new version of the crawler cache serving servlet * * @author rana * */ public class ProxyServlet2 extends HttpServlet { private static final Log LOG = LogFactory.getLog(ProxyServlet.class); private int _tunnelTimeoutMs=3000; private ExecutorService _threadPool = Executors.newFixedThreadPool(100); private HttpCookieUtils.CookieStore _cookieStore = new HttpCookieUtils.CookieStore(); protected HashSet _DontProxyHeaders = new HashSet(); { _DontProxyHeaders.add("proxy-connection"); _DontProxyHeaders.add("connection"); _DontProxyHeaders.add("keep-alive"); _DontProxyHeaders.add("transfer-encoding"); _DontProxyHeaders.add("te"); _DontProxyHeaders.add("trailer"); _DontProxyHeaders.add("proxy-authorization"); _DontProxyHeaders.add("proxy-authenticate"); _DontProxyHeaders.add("upgrade"); _DontProxyHeaders.add("cache-control"); _DontProxyHeaders.add("pragma"); _DontProxyHeaders.add("last-modified"); _DontProxyHeaders.add("date"); _DontProxyHeaders.add("age"); _DontProxyHeaders.add("etag"); _DontProxyHeaders.add("expires"); _DontProxyHeaders.add("user-agent"); } private ServletConfig config; private ServletContext context; /* (non-Javadoc) * @see javax.servlet.Servlet#init(javax.servlet.ServletConfig) */ public void init(ServletConfig config) throws ServletException { this.config=config; this.context=config.getServletContext(); } /* (non-Javadoc) * @see javax.servlet.Servlet#getServletConfig() */ public ServletConfig getServletConfig() { return config; } private static File cachePathFromURL(URL theURL) throws MalformedURLException { String canonicalURL = URLUtils.canonicalizeURL(theURL.toString(),true); long fingerprint = URLFingerprint.generate64BitURLFPrint(canonicalURL); File cachePath = new File(ProxyServer.getSingleton().getDataDirectory(),"diskCache"); cachePath.mkdir(); File filePath = new File(cachePath,Long.toString(fingerprint)); return filePath; } public static class CacheLoadRequest { URL _theURL; public CacheLoadRequest(URL theURL) { _theURL = theURL; } public DiskCacheItem executeRequest() { try { // ok ... first construct file path to url ... File cacheFilePath = cachePathFromURL(_theURL); // now check to see if file exists ... if (cacheFilePath.exists() && cacheFilePath.isFile()) { // ok, we are running in the servlet thread context here ... so // it is ok to block on io requests directly ... FileInputStream inputStream = new FileInputStream(cacheFilePath); try { // load cache item from stream ... DataInputStream dataInput = new DataInputStream(inputStream); // load it DiskCacheItem item = new DiskCacheItem(); item.deserialize(dataInput,new BinaryProtocol()); return item; } finally { if (inputStream != null) { inputStream.close(); } } } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } return null; } } public static class NIOConnectionWrapper implements NIOHttpConnection.Listener, DataSource { private Semaphore _blockingSemaphore = new Semaphore(0); NIOHttpConnection _connection; byte[] _uploadBuffer; boolean _connectionFailed = false; public NIOConnectionWrapper(NIOHttpConnection connection) { _connection = connection; _connection.setListener(this); } public void setUploadBuffer(byte[] buffer) { _uploadBuffer = buffer; } @Override public void HttpConnectionStateChanged(NIOHttpConnection theConnection,State oldState, State state) { if (state == State.DONE || state == State.ERROR) { _connectionFailed = (state == State.ERROR); _blockingSemaphore.release(); _connection.setListener(null); } } @Override public void HttpContentAvailable(NIOHttpConnection theConnection,NIOBufferList contentBuffer) { // NOOP } @Override public boolean read(NIOHttpConnection source,NIOBufferList dataBuffer) throws IOException { if (_uploadBuffer != null) { dataBuffer.write(_uploadBuffer, 0, _uploadBuffer.length); _uploadBuffer = null; } return true; } public boolean waitForCompletion() { _blockingSemaphore.acquireUninterruptibly(); return !_connectionFailed; } @Override public void finsihedWriting(NIOHttpConnection sourceConnection, ByteBuffer thisBuffer) throws IOException { // TODO Auto-generated method stub } } /** build a NIOHttpHeader object from the cahce file header item array * */ private static NIOHttpHeaders buildHeaderFromHeaderItems(ArrayList<ArcFileHeaderItem> items) { NIOHttpHeaders headers = new NIOHttpHeaders(); for (ArcFileHeaderItem item : items){ headers.add(item.getItemKey(), item.getItemValue()); } return headers; } private static class RequestDetails { public URL url; ArrayList<String> log = new ArrayList<String>(); @Override public String toString() { StringBuffer outputBuffer = new StringBuffer(); outputBuffer.append("URL:" + url.toString() + "\n"); for (String logline : log) { outputBuffer.append("--" + logline + "\n"); } return outputBuffer.toString(); } } public void serviceProxyInternalRequest(ServletRequest request,ServletResponse response) throws IOException { String uri=((HttpServletRequest)request).getRequestURI(); if (uri.equalsIgnoreCase("/dumpCookies")) { Vector<CanonicalCookie> cookies = new Vector<CanonicalCookie>(); // get a copy of all the cookie objects ... _cookieStore.GetAllCookies(cookies); PrintWriter writer = response.getWriter(); HttpServletResponse resp = (HttpServletResponse)response; resp.setStatus(200); resp.setContentType("text/html"); writer.println("<pre>"); for (CanonicalCookie cookie : cookies) { writer.println(cookie.toString()); } writer.println("</pre>"); } } /* (non-Javadoc) * @see javax.servlet.Servlet#service(javax.servlet.ServletRequest, javax.servlet.ServletResponse) */ public void service(ServletRequest req, ServletResponse res) throws ServletException, IOException { HttpServletRequest request = (HttpServletRequest)req; HttpServletResponse response = (HttpServletResponse)res; if ("CONNECT".equalsIgnoreCase(request.getMethod())) { handleConnect(request,response); } else { final RequestDetails details = new RequestDetails(); String uri=request.getRequestURI(); if (request.getQueryString()!=null) uri+="?"+request.getQueryString(); final URL url = new URL(request.getScheme(), request.getServerName(), request.getServerPort(), uri); if (request.getServerName().equals("proxy")) { serviceProxyInternalRequest(req,res); return; } // context.log("URL="+url); details.url = url; // attempt cache load first ... CacheLoadRequest cacheLoad = new CacheLoadRequest(url); details.log.add("Executing Disk Load Request"); DiskCacheItem cacheItem = cacheLoad.executeRequest(); details.log.add("Disk Load Request Returned:" + cacheItem); // create metadata placeholder CrawlURLMetadata metadata = new CrawlURLMetadata(); NIOHttpHeaders headers = null; boolean revalidate = false; boolean cacheItemValid = true; if (cacheItem != null) { // get headers headers = buildHeaderFromHeaderItems(cacheItem.getHeaderItems()); // set last fetch time in metadata metadata.setLastFetchTimestamp(cacheItem.getFetchTime()); // parse headers HttpHeaderInfoExtractor.parseHeaders(headers, metadata); // ok now validate cache if (HttpCacheUtils.requiresValidation(metadata)) { details.log.add("CACHE Item Present But Needs Revalidation"); revalidate = true; } } // if no cache item or we to revalidate cache item .. if (cacheItem == null || revalidate) { NIOHttpConnection connection = new NIOHttpConnection( url, ProxyServer.getSingleton().getEventLoop().getSelector(), ProxyServer.getSingleton().getEventLoop().getResolver(),_cookieStore); NIOConnectionWrapper wrapper = new NIOConnectionWrapper(connection); // URLConnection connection = url.openConnection(); // connection.setAllowUserInteraction(false); // Set method /* HttpURLConnection http = null; if (connection instanceof HttpURLConnection) { http = (HttpURLConnection)connection; http.setRequestMethod(request.getMethod()); http.setInstanceFollowRedirects(false); } */ connection.setMethod(request.getMethod()); // check connection header String connectionHdr = request.getHeader("Connection"); if (connectionHdr!=null) { connectionHdr=connectionHdr.toLowerCase(); if (connectionHdr.equals("keep-alive")|| connectionHdr.equals("close")) connectionHdr=null; } // copy headers boolean xForwardedFor=false; boolean hasContent=false; Enumeration enm = request.getHeaderNames(); while (enm.hasMoreElements()) { // TODO could be better than this! String hdr=(String)enm.nextElement(); String lhdr=hdr.toLowerCase(); if (_DontProxyHeaders.contains(lhdr) || lhdr.equals("cookie")) continue; if (connectionHdr!=null && connectionHdr.indexOf(lhdr)>=0) continue; if ("content-type".equals(lhdr)) hasContent=true; Enumeration vals = request.getHeaders(hdr); while (vals.hasMoreElements()) { String val = (String)vals.nextElement(); if (val!=null) { connection.getRequestHeaders().set(hdr, val); // connection.addRequestProperty(hdr,val); details.log.add("req header: "+hdr+": "+val); xForwardedFor|="X-Forwarded-For".equalsIgnoreCase(hdr); } } } String cookies = _cookieStore.GetCookies(url); if (cookies.length() != 0) { details.log.add("req injected-header: Cookie:" +cookies); connection.getRequestHeaders().set("Cookie", cookies); } // Proxy headers connection.getRequestHeaders().set("Via", "1.1 (jetty)"); // cache headers (if required) if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) { details.log.add("Sending If-Modified-Since"); connection.getRequestHeaders().set("If-Modified-Since",headers.findValue("Last-Modified")); } if (metadata.isFieldDirty(CrawlURLMetadata.Field_ETAG)) { details.log.add("Sending If-None-Match"); connection.getRequestHeaders().set("If-None-Match", metadata.getETag()); } if (!xForwardedFor) connection.getRequestHeaders().set("X-Forwarded-For",request.getRemoteAddr()); //connection.addRequestProperty("X-Forwarded-For",request.getRemoteAddr()); // a little bit of cache control String cache_control = request.getHeader("Cache-Control"); /* if (cache_control!=null && (cache_control.indexOf("no-cache")>=0 || cache_control.indexOf("no-store")>=0)) connection.setUseCaches(false); */ // customize Connection try { // connection.setDoInput(true); // do input thang! InputStream in=request.getInputStream(); if (hasContent) { //connection.setDoOutput(true); ByteArrayOutputStream stream = new ByteArrayOutputStream(); IO.copy(in,stream); wrapper.setUploadBuffer(stream.toByteArray()); } // Connect connection.open(); } catch (Exception e) { details.log.add(CCStringUtils.stringifyException(e)); } boolean connectionSucceeded = wrapper.waitForCompletion(); InputStream proxy_in = null; // handler status codes etc. int code=500; if (connectionSucceeded) { // set last fetch time in metadata metadata.setLastFetchTimestamp(System.currentTimeMillis()); code=connection.getResponseHeaders().getHttpResponseCode(); if (revalidate && code != 304) { details.log.add("Item ReValidate FAILED"); cacheItemValid = false; } if (code != 304) { HttpHeaderInfoExtractor.parseHeaders(connection.getResponseHeaders(), metadata); response.setStatus(code,""); details.log.add("response code:"+code); // clear response defaults. response.setHeader("Date",null); response.setHeader("Server",null); // set response headers int h=0; String hdr=connection.getResponseHeaders().getKey(h); String val=connection.getResponseHeaders().getValue(h); while(hdr!=null || val!=null) { String lhdr = hdr!=null?hdr.toLowerCase():null; if (hdr!=null && val!=null && !_DontProxyHeaders.contains(lhdr)) response.addHeader(hdr,val); details.log.add("response header:" +hdr+": "+val); h++; hdr=connection.getResponseHeaders().getKey(h); val=connection.getResponseHeaders().getValue(h); } response.addHeader("Via","1.1 (jetty)"); response.addHeader("cache-control","no-cache,no-store"); response.addHeader("Connection","close"); // IF RESULT IS CACHEABLE ... LifeTimeInfo lifeTimeInfo = HttpCacheUtils.getFreshnessLifetimeInMilliseconds(metadata); details.log.add("getFreshnessLifetime returned:" + lifeTimeInfo._lifetime); details.log.add("getFreshnessLifetime source:" + lifeTimeInfo._source); if (lifeTimeInfo._lifetime != 0) { details.log.add("item is cachable - issuing cache request"); // construct a disk cache item ... final DiskCacheItem cacheItemForWrite = new DiskCacheItem(); // populate cacheItemForWrite.setFetchTime(System.currentTimeMillis()); cacheItemForWrite.setResponseCode(code); // headers .. h=0; hdr=connection.getResponseHeaders().getKey(h); val=connection.getResponseHeaders().getValue(h); while(hdr!=null || val!=null) { String lhdr = hdr!=null?hdr.toLowerCase():null; if (hdr!=null && val!=null) { if (!hdr.toLowerCase().equals("set-cookie")) { ArcFileHeaderItem item = new ArcFileHeaderItem(); item.setItemKey(hdr); item.setItemValue(val); cacheItemForWrite.getHeaderItems().add(item); } } h++; hdr=connection.getResponseHeaders().getKey(h); val=connection.getResponseHeaders().getValue(h); } if (connection.getContentBuffer().available() != 0) { // copy result to byte array //VERY INEFFICIENT ... BUT ONLY FOR TESTING ... ByteArrayOutputStream tempStream = new ByteArrayOutputStream(); IO.copy(new NIOBufferListInputStream(connection.getContentBuffer()),tempStream); // get the underlying buffer byte[] responseBuffer = tempStream.toByteArray(); // set it into the cache item ... cacheItemForWrite.setContent(new Buffer(responseBuffer)); // and now write out buffer IO.copy(new ByteArrayInputStream(responseBuffer),response.getOutputStream()); } // ok schedule a disk cache write ... _threadPool.execute(new Runnable() { @Override public void run() { LOG.info("Writing Cache Item for URL:" + url); File cacheFileName; try { cacheFileName = cachePathFromURL(url); try { FileOutputStream fileStream = new FileOutputStream(cacheFileName); try { DataOutputStream dataOutputStream = new DataOutputStream(fileStream); cacheItemForWrite.serialize(dataOutputStream,new BinaryProtocol()); } finally { fileStream.close(); } } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); } } catch (MalformedURLException e) { LOG.error(CCStringUtils.stringifyException(e)); } } }); } else { details.log.add("FRESHNESS LIFETIME == 0 - SKIPPING CACHE!"); // no cache direct copy case if (connection.getContentBuffer().available() != 0) { IO.copy(new NIOBufferListInputStream(connection.getContentBuffer()),response.getOutputStream()); } } } } else { response.setStatus(500,"Proxy Request Failed"); details.log.add("Proxy Request Failed"); } } // ok now, if cache item != null and cache-item is still valid if (cacheItem != null && cacheItemValid) { // service request from cache details.log.add("Servicing Request From Disk Cache"); // clear response defaults. response.setHeader("Date",null); response.setHeader("Server",null); // set response code response.setStatus(cacheItem.getResponseCode()); // set response headers for (ArcFileHeaderItem headerItem : cacheItem.getHeaderItems()) { String key = headerItem.getItemKey().toLowerCase(); // if not in don't proxy headers ... if (key.length() != 0) { if (!_DontProxyHeaders.contains(key) && !key.equals("set-cookie")) { response.addHeader(headerItem.getItemKey(),headerItem.getItemValue()); details.log.add("cache response: "+headerItem.getItemKey()+": "+headerItem.getItemValue()); } else { details.log.add("cache hidden-hdr: "+headerItem.getItemKey()+": "+headerItem.getItemValue()); } } } response.addHeader("Via","1.1 (jetty)"); response.addHeader("cache-control","no-cache,no-store"); response.addHeader("Connection","close"); if (cacheItem.getContent().getCount() != 0) { response.setHeader("Content-Length",null); response.addHeader("Content-Length",Integer.toString(cacheItem.getContent().getCount())); IO.copy(new ByteArrayInputStream(cacheItem.getContent().getReadOnlyBytes()),response.getOutputStream()); } } LOG.info(details.toString()); } } /* ------------------------------------------------------------ */ public void handleConnect(HttpServletRequest request, HttpServletResponse response) throws IOException { String uri = request.getRequestURI(); context.log("CONNECT: "+uri); // InetAddrPort addrPort=new InetAddrPort(uri); URL url = new URL(uri); InetAddress address = InetAddress.getByName(url.getHost()); int port = (url.getPort() != -1) ? url.getPort() : 80; //if (isForbidden(HttpMessage.__SSL_SCHEME,addrPort.getHost(),addrPort.getPort(),false)) //{ // sendForbid(request,response,uri); //} //else { InputStream in=request.getInputStream(); OutputStream out=response.getOutputStream(); Socket socket = new Socket(address,port); context.log("Socket: "+socket); response.setStatus(200); response.setHeader("Connection","close"); response.flushBuffer(); System.err.println(response); context.log("out<-in"); IO.copyThread(socket.getInputStream(),out); context.log("in->out"); IO.copy(in,socket.getOutputStream()); } } /* (non-Javadoc) * @see javax.servlet.Servlet#getServletInfo() */ public String getServletInfo() { return "Proxy Servlet"; } /* (non-Javadoc) * @see javax.servlet.Servlet#destroy() */ public void destroy() { } }