/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.service.listcrawler;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Semaphore;

import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.record.Buffer;
import org.apache.tools.ant.filters.StringInputStream;

import org.commoncrawl.async.Timer;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.io.NIOHttpConnection;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CacheItem;
import org.commoncrawl.protocol.CrawlURL;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.protocol.shared.ArcFileHeaderItem;
import org.commoncrawl.protocol.shared.ArcFileItem;
import org.commoncrawl.rpc.base.internal.AsyncRequest;
import org.commoncrawl.rpc.base.shared.RPCException;
import org.commoncrawl.service.crawler.CrawlItemStatusCallback;
import org.commoncrawl.service.crawler.CrawlTarget;
import org.commoncrawl.service.listcrawler.CacheManager.CacheItemCheckCallback;
import org.commoncrawl.service.queryserver.ContentQueryRPCInfo;
import org.commoncrawl.service.queryserver.ContentQueryRPCResult;
import org.commoncrawl.util.ArcFileItemUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CharsetUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.GZIPUtils;
import org.commoncrawl.util.GZIPUtils.UnzipResult;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.ImmutableSet;

/**
 * Servlet that serves cached content via the crawler's cache
 *
 * @author rana
 */
public class ProxyServlet extends HttpServlet {

  private static final Log LOG = LogFactory.getLog(ProxyServlet.class);

  private static final String PROXY_HEADER_SOURCE               = "x-ccproxy-source";
  private static final String PROXY_HEADER_ORIG_STATUS          = "x-ccproxy-original-status";
  private static final String PROXY_HEADER_TIMER                = "x-ccproxy-timer";
  private static final String PROXY_HEADER_FINALURL             = "x-ccproxy-final-url";
  private static final String PROXY_HEADER_TRUNCATION           = "x-ccproxy-truncated";
  private static final String PROXY_HEADER_ORIGINAL_CONTENT_LEN = "x-ccproxy-orig-content-len";

  private static final String PROXY_RENDER_TYPE_TEXT = "text";
  private static final String PROXY_RENDER_TYPE_NONE = "none";
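  // Illustrative request forms (the servlet's mount point is deployment-specific,
  // "/proxy" below is just a placeholder; the parameter names match doGet at the
  // bottom of this file):
  //
  //   GET /proxy?url=http%3A%2F%2Fexample.com%2F    - proxy the cached/crawled response as-is
  //   GET /proxy?url=...&renderAs=text              - render status, headers, and decoded body as text/plain
  //   GET /proxy?url=...&timeout=10000              - override the default 30000ms crawl timeout
  //   GET /proxy?url=...&nocachenodice=1            - cache-only: fail with a 403 instead of live-fetching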
  /**
   * Response envelope used to hand results from the async event thread back to
   * the blocked servlet thread.
   */
  private static class AsyncResponse {

    public enum ResponseType {
      HTTPErrorResponse,
      CacheItemResponse,
      CrawlURLResponse,
      S3Response
    }

    private long _startTime = System.currentTimeMillis();
    private int _httpErrorCode = 400;
    private String _httpErrorCodeDesc = "";
    private ResponseType _responseType = ResponseType.HTTPErrorResponse;
    private CacheItem _cacheItem = null;
    private CrawlURL _urlItem = null;
    private ArcFileItem _arcFileItem = null;
    private boolean _isCrawlComplete = false;

    public ResponseType getResponseType() { return _responseType; }
    public CacheItem getCacheItem() { return _cacheItem; }
    public ArcFileItem getArcFileItem() { return _arcFileItem; }
    public CrawlURL getCrawlURL() { return _urlItem; }
    public int getHttpErrorCode() { return _httpErrorCode; }
    public String getHttpErrorDesc() { return _httpErrorCodeDesc; }

    public synchronized boolean isCrawlComplete() { return _isCrawlComplete; }
    public synchronized void setCrawlComplete(boolean isComplete) { _isCrawlComplete = isComplete; }

    public void setStartTime(long startTime) { _startTime = startTime; }
    public long getStartTime() { return _startTime; }

    public void setCacheItemResponse(CacheItem item) {
      _responseType = ResponseType.CacheItemResponse;
      _cacheItem = item;
    }

    public void setS3ItemResponse(ArcFileItem item) {
      _responseType = ResponseType.S3Response;
      _arcFileItem = item;
    }

    public void setURLItemResponse(CrawlURL item) {
      _responseType = ResponseType.CrawlURLResponse;
      _urlItem = item;
    }

    public void setHttpErrorResponse(int httpErrorCode, String httpErrorResponse) {
      _responseType = ResponseType.HTTPErrorResponse;
      _httpErrorCode = httpErrorCode;
      _httpErrorCodeDesc = httpErrorResponse;
    }
  }

  public ProxyServlet() {
  }

  /**
   * Parse raw header text (one header per line) into a list of ArcFileHeaderItem objects.
   * Lines without a "key:value" separator (e.g. the HTTP status line) are stored as value-only items.
   */
  private static ArrayList<ArcFileHeaderItem> populateHeaders(String headerData) {
    ArrayList<ArcFileHeaderItem> headerItems = new ArrayList<ArcFileHeaderItem>();

    BufferedReader reader = new BufferedReader(new InputStreamReader(new StringInputStream(headerData)));
    String line = null;
    try {
      while ((line = reader.readLine()) != null) {
        if (line.length() != 0) {
          int colonPos = line.indexOf(':');
          ArcFileHeaderItem item = new ArcFileHeaderItem();
          if (colonPos != -1 && colonPos != line.length() - 1) {
            item.setItemKey(line.substring(0, colonPos));
            item.setItemValue(line.substring(colonPos + 1));
          } else {
            // no key/value separator - store the whole line as a value (status line case)
            item.setItemValue(line);
          }
          headerItems.add(item);
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
    return headerItems;
  }

  private static void cacheS3ItemResult(ArcFileItem itemResult, String targetURL, long fingerprint) {
    CacheItem cacheItem = new CacheItem();

    cacheItem.setUrlFingerprint(fingerprint);
    cacheItem.setUrl(targetURL);
    cacheItem.setSource((byte) CacheItem.Source.S3Cache);
    cacheItem.setHeaderItems(itemResult.getHeaderItems());
    cacheItem.setFieldDirty(CacheItem.Field_HEADERITEMS);
    cacheItem.setContent(new Buffer(itemResult.getContent().getReadOnlyBytes(), 0, itemResult.getContent().getCount()));

    if ((itemResult.getFlags() & ArcFileItem.Flags.TruncatedInDownload) != 0) {
      cacheItem.setFlags(cacheItem.getFlags() | CacheItem.Flags.Flag_WasTruncatedDuringDownload);
    }
    if ((itemResult.getFlags() & ArcFileItem.Flags.TruncatedInInflate) != 0) {
      cacheItem.setFlags(cacheItem.getFlags() | CacheItem.Flags.Flag_WasTruncatedDuringInflate);
    }

    ProxyServer.getSingleton().getCache().cacheItem(cacheItem, null);
  }
  /**
   * Calculate the number of IO operations required to cache a given CrawlURL
   */
  public static int calculateCachedItemCountGivenCrawlURL(CrawlURL urlObject) {
    int cachedItemCount = 0;
    try {
      if ((urlObject.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
        String originalCanonicalURL = URLUtils.canonicalizeURL(urlObject.getUrl(), true);
        String redirectCanonicalURL = URLUtils.canonicalizeURL(urlObject.getRedirectURL(), true);
        if (!originalCanonicalURL.equals(redirectCanonicalURL)) {
          cachedItemCount++;
        }
      }
      if (urlObject.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) {
        cachedItemCount++;
      }
    } catch (IOException e) {
      LOG.error("Encountered Exception while calculating cachedItemCount:" + CCStringUtils.stringifyException(e));
    }
    return cachedItemCount;
  }
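  // Illustrative accounting (hypothetical URL): a fetch of http://example.com/
  // that 301-redirects to http://example.com/home and downloads successfully
  // produces two cache writes - a redirect stub plus a content item - so the
  // method above returns 2, and cacheCrawlURLResult below releases the optional
  // completion semaphore once per write.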
  /**
   * Process a CrawlURL object, and inject any valid contents into the cache
   *
   * @param urlResult - the CrawlURL object containing the crawl result
   * @param optionalCompletionSemaphore - a completion semaphore that will be released an appropriate
   *          number of times as IO operations complete - SEE calculateCachedItemCountGivenCrawlURL
   */
  public static void cacheCrawlURLResult(CrawlURL urlResult, Semaphore optionalCompletionSemaphore) {
    try {
      // first check to see if this was a redirect ...
      if ((urlResult.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
        // check to see if the canonical urls are the same
        String originalCanonicalURL = URLUtils.canonicalizeURL(urlResult.getUrl(), true);
        String redirectCanonicalURL = URLUtils.canonicalizeURL(urlResult.getRedirectURL(), true);

        if (!originalCanonicalURL.equals(redirectCanonicalURL)) {
          // try to cache the redirect ...
          CacheItem cacheItem = new CacheItem();

          cacheItem.setUrlFingerprint(urlResult.getFingerprint());
          cacheItem.setUrl(URLUtils.canonicalizeURL(urlResult.getUrl(), true));
          cacheItem.setFinalURL(urlResult.getRedirectURL());
          cacheItem.setSource((byte) CacheItem.Source.WebRequest);
          cacheItem.setHeaderItems(populateHeaders(urlResult.getOriginalHeaders()));
          cacheItem.setFieldDirty(CacheItem.Field_HEADERITEMS);

          switch (urlResult.getOriginalResultCode()) {
            case 301:
              cacheItem.setFlags((byte) CacheItem.Flags.Flag_IsPermanentRedirect);
              break;
            default:
              cacheItem.setFlags((byte) CacheItem.Flags.Flag_IsTemporaryRedirect);
              break;
          }

          if ((urlResult.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
            cacheItem.setFlags(cacheItem.getFlags() | CacheItem.Flags.Flag_WasTruncatedDuringDownload);
          }

          //LOG.info("### CACHING Item:" + cacheItem.getUrl());
          ProxyServer.getSingleton().getCache().cacheItem(cacheItem, optionalCompletionSemaphore);
        }
      }

      if (urlResult.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) {
        CacheItem cacheItem = new CacheItem();

        boolean isRedirect = (urlResult.getFlags() & CrawlURL.Flags.IsRedirected) != 0;
        String canonicalURL = URLUtils.canonicalizeURL((isRedirect) ? urlResult.getRedirectURL() : urlResult.getUrl(), true);

        cacheItem.setUrl(canonicalURL);
        cacheItem.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(canonicalURL));
        cacheItem.setSource((byte) CacheItem.Source.WebRequest);
        cacheItem.setFieldDirty(CacheItem.Field_HEADERITEMS);
        cacheItem.setHeaderItems(populateHeaders(urlResult.getHeaders()));

        // detect content encoding
        for (ArcFileHeaderItem headerItem : cacheItem.getHeaderItems()) {
          if (headerItem.getItemKey().equalsIgnoreCase("content-encoding")) {
            if (headerItem.getItemValue().equalsIgnoreCase("gzip")
                || headerItem.getItemValue().equalsIgnoreCase("deflate")) {
              // set compressed flag
              cacheItem.setFlags((byte) (cacheItem.getFlags() | CacheItem.Flags.Flag_IsCompressed));
            }
            break;
          }
        }

        cacheItem.setContent(new FlexBuffer(urlResult.getContentRaw().getReadOnlyBytes(), 0, urlResult.getContentRaw().getCount()));

        //LOG.info("### CACHING Item:" + cacheItem.getUrl());
        ProxyServer.getSingleton().getCache().cacheItem(cacheItem, optionalCompletionSemaphore);
      }
    } catch (MalformedURLException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }

  private static void addHeaderItem(ArrayList<ArcFileHeaderItem> items, String name, String value) {
    ArcFileHeaderItem item = new ArcFileHeaderItem();
    item.setItemKey(name);
    item.setItemValue(value);
    // insert at index 1 - directly after the status line
    items.add(1, item);
  }

  private static void removeHeaderItem(ArrayList<ArcFileHeaderItem> items, String name) {
    for (int i = 0; i < items.size(); ++i) {
      if (items.get(i).getItemKey().equalsIgnoreCase(name)) {
        items.remove(i);
        break;
      }
    }
  }

  /** hop-by-hop / transport-level headers that should not be passed through by the proxy */
  private static ImmutableSet<String> dontProxyHeaders = ImmutableSet.of(
      "proxy-connection", "connection", "keep-alive", "transfer-encoding", "te", "trailer",
      "proxy-authorization", "proxy-authenticate", "upgrade", "content-length", "content-encoding");

  private static BufferedReader readerForCharset(NIOHttpHeaders headers, byte[] content, int contentLength, PrintWriter debugWriter) throws IOException {
    CrawlURLMetadata metadata = new CrawlURLMetadata();
    HttpHeaderInfoExtractor.parseHeaders(headers, metadata);

    String charset = metadata.getCharset();

    if (charset.length() != 0) {
      debugWriter.println("***** Charset(via HttpHeaders):" + charset);
    } else {
      Pair<Integer, Charset> charsetTuple = CharsetUtils.bestEffortDetectCharset(headers.toString(), content, 0, contentLength);
      if (charsetTuple != null) {
        charset = charsetTuple.e1.toString();
        debugWriter.println("***** Charset(via HTML MetaTag):" + charset);
      }
    }

    if (charset == null || charset.length() == 0) {
      charset = "ASCII";
      debugWriter.println("***** Charset(NotFound-UsingDefault):ASCII");
    }

    Charset charsetObj;
    try {
      charsetObj = Charset.forName(charset);
    } catch (Exception e) {
      // Charset.forName throws (rather than returning null) for unknown or illegal names
      debugWriter.println("***** Could Not Create CharsetDecoder for charset:" + charset);
      LOG.info("Unable to create Charset for name:" + charset + ". Using ASCII");
      charsetObj = Charset.forName("ASCII");
    }

    debugWriter.println("***** Content:");

    return new BufferedReader(new InputStreamReader(new ByteArrayInputStream(content, 0, contentLength), charsetObj));
  }
  private static void sendS3ItemResponse(final HttpServletRequest req, final HttpServletResponse response,
      ArcFileItem responseItem, String renderAs, AsyncResponse responseObject, long requestStartTime) throws IOException {

    // populate a cache item object from the arc file item, so that s3 results
    // can be rendered via the common cache item response path ...
    CacheItem cacheItem = new CacheItem();

    cacheItem.setHeaderItems(responseItem.getHeaderItems());
    cacheItem.setFieldDirty(CacheItem.Field_HEADERITEMS);
    cacheItem.setUrl(responseItem.getUri());
    cacheItem.setUrlFingerprint(URLUtils.getCanonicalURLFingerprint(responseItem.getUri(), true));
    cacheItem.setSource((byte) CacheItem.Source.S3Cache);
    cacheItem.setContent(new Buffer(responseItem.getContent().getReadOnlyBytes(), 0, responseItem.getContent().getCount()));

    sendCacheItemResponse(req, response, cacheItem, true, renderAs, responseObject, requestStartTime);
  }
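  // Illustrative renderAs=text output produced by sendCacheItemResponse below
  // (values are invented; the x-ccproxy-* names are the constants at the top of this class):
  //
  //   HTTP/1.1 200 OK
  //   x-ccproxy-source:cache
  //   x-ccproxy-timer:12MS
  //   x-ccproxy-final-url:http://example.com/
  //   content-length:5120
  //   ... remaining origin headers, minus the dontProxyHeaders set ...
  //
  //   <decoded document body, one line per writer.println>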
  private static void sendCacheItemResponse(final HttpServletRequest req, final HttpServletResponse response,
      CacheItem responseItem, boolean isS3Response, String renderAs, AsyncResponse responseObject,
      long requestStartTime) throws IOException {

    // remove default headers ...
    response.setHeader("Date", null);
    response.setHeader("Server", null);

    // parse response code in headers ...
    CrawlURLMetadata metadata = new CrawlURLMetadata();
    HttpHeaderInfoExtractor.parseStatusLine(responseItem.getHeaderItems().get(0).getItemValue(), metadata);
    if (!metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESULTCODE)) {
      metadata.setHttpResultCode(200);
    }

    // set the result code ...
    response.setStatus(metadata.getHttpResultCode());

    if (renderAs.equals(PROXY_RENDER_TYPE_TEXT)) {
      response.setHeader("content-type", "text/plain");

      PrintWriter writer = response.getWriter();

      writer.write(responseItem.getHeaderItems().get(0).getItemValue() + "\n");
      if (isS3Response)
        writer.write(PROXY_HEADER_SOURCE + ":s3\n");
      else
        writer.write(PROXY_HEADER_SOURCE + ":cache\n");
      writer.write(PROXY_HEADER_TIMER + ":" + (System.currentTimeMillis() - requestStartTime) + "MS\n");
      writer.write(PROXY_HEADER_FINALURL + ":" + responseItem.getFinalURL() + "\n");
      writer.write("content-length:" + Integer.toString(responseItem.getContent().getCount()) + "\n");
      if ((responseItem.getFlags() & CacheItem.Flags.Flag_IsCompressed) != 0) {
        writer.write("content-encoding:gzip\n");
      }

      String truncationFlags = "";
      if ((responseItem.getFlags() & CacheItem.Flags.Flag_WasTruncatedDuringDownload) != 0) {
        truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload);
      }
      if ((responseItem.getFlags() & CacheItem.Flags.Flag_WasTruncatedDuringInflate) != 0) {
        if (truncationFlags.length() != 0)
          truncationFlags += ",";
        truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate);
      }
      if (truncationFlags.length() != 0) {
        writer.write(PROXY_HEADER_TRUNCATION + ":" + truncationFlags + "\n");
      }

      // iterate header items
      for (ArcFileHeaderItem headerItem : responseItem.getHeaderItems()) {
        // ignore unwanted items
        if (headerItem.getItemKey().length() != 0) {
          if (headerItem.getItemValue().length() != 0) {
            if (!dontProxyHeaders.contains(headerItem.getItemKey().toLowerCase())) {
              // and send other ones through
              writer.write(headerItem.getItemKey() + ":" + headerItem.getItemValue() + "\n");
            } else {
              if (headerItem.getItemKey().equalsIgnoreCase("content-length")) {
                writer.write(PROXY_HEADER_ORIGINAL_CONTENT_LEN + ":" + headerItem.getItemValue() + "\n");
              }
            }
          }
        }
      }
      writer.write("\n");

      int contentLength = responseItem.getContent().getCount();
      byte contentData[] = responseItem.getContent().getReadOnlyBytes();

      if ((responseItem.getFlags() & CacheItem.Flags.Flag_IsCompressed) != 0) {
        UnzipResult result = GZIPUtils.unzipBestEffort(contentData, CrawlEnvironment.CONTENT_SIZE_LIMIT);
        if (result != null) {
          contentData = result.data.get();
          contentLength = result.data.getCount();
        }
      }

      NIOHttpHeaders headers = ArcFileItemUtils.buildHeaderFromArcFileItemHeaders(responseItem.getHeaderItems());

      BufferedReader bufferedReader = readerForCharset(headers, contentData, contentLength, writer);
      try {
        String line = null;
        while ((line = bufferedReader.readLine()) != null) {
          writer.println(line);
        }
      } finally {
        bufferedReader.close();
      }
      writer.flush();
    } else {
      // set the content length ...
      response.setHeader("content-length", Integer.toString(responseItem.getContent().getCount()));
      if ((responseItem.getFlags() & CacheItem.Flags.Flag_IsCompressed) != 0) {
        response.setHeader("content-encoding", "gzip");
      }
      if (isS3Response)
        response.setHeader(PROXY_HEADER_SOURCE, "s3");
      else
        response.setHeader(PROXY_HEADER_SOURCE, "cache");
      response.setHeader(PROXY_HEADER_TIMER, (System.currentTimeMillis() - requestStartTime) + "MS");
      response.setHeader(PROXY_HEADER_FINALURL, responseItem.getFinalURL());

      String truncationFlags = "";
      if ((responseItem.getFlags() & CacheItem.Flags.Flag_WasTruncatedDuringDownload) != 0) {
        truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload);
      }
      if ((responseItem.getFlags() & CacheItem.Flags.Flag_WasTruncatedDuringInflate) != 0) {
        if (truncationFlags.length() != 0)
          truncationFlags += ",";
        truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInInflate);
      }
      if (truncationFlags.length() != 0) {
        response.setHeader(PROXY_HEADER_TRUNCATION, truncationFlags);
      }

      // iterate header items
      for (ArcFileHeaderItem headerItem : responseItem.getHeaderItems()) {
        // ignore unwanted items
        if (headerItem.getItemKey().length() != 0) {
          if (headerItem.getItemValue().length() != 0) {
            if (!dontProxyHeaders.contains(headerItem.getItemKey().toLowerCase())) {
              // and send other ones through
              response.setHeader(headerItem.getItemKey(), headerItem.getItemValue());
            } else {
              if (headerItem.getItemKey().equalsIgnoreCase("content-length")) {
                response.setHeader(PROXY_HEADER_ORIGINAL_CONTENT_LEN, headerItem.getItemValue());
              }
            }
          }
        }
      }

      ServletOutputStream responseOutputStream = response.getOutputStream();

      // write out content bytes
      responseOutputStream.write(responseItem.getContent().getReadOnlyBytes(), 0, responseItem.getContent().getCount());
    }

    ProxyServer.getSingleton().logProxySuccess(metadata.getHttpResultCode(), (isS3Response) ? "s3" : "cache",
        responseItem.getUrl(), responseItem.getFinalURL(), responseObject.getStartTime());
  }
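  // sendCrawlURLResponse mirrors sendCacheItemResponse, but renders a CrawlURL
  // produced by a live crawler fetch; failed fetches are reported as a 500 with
  // the crawler's failure reason.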
  private static void sendCrawlURLResponse(final HttpServletRequest req, final HttpServletResponse response,
      CrawlURL url, String renderAs, AsyncResponse responseObject, long requestStartTime) throws IOException {

    if (url.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) {
      // remove default headers ...
      response.setHeader("Date", null);
      response.setHeader("Server", null);

      // set the result code ...
      response.setStatus(200);

      if (renderAs.equals(PROXY_RENDER_TYPE_TEXT)) {
        response.setHeader("content-type", "text/plain");

        // parse headers ...
        NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(url.getHeaders());

        PrintWriter writer = response.getWriter();

        writer.write(PROXY_HEADER_SOURCE + ":origin\n");
        writer.write(PROXY_HEADER_ORIG_STATUS + ":" + headers.getValue(0) + "\n");
        writer.write(PROXY_HEADER_TIMER + ":" + (System.currentTimeMillis() - requestStartTime) + "MS\n");
        writer.write(PROXY_HEADER_FINALURL + ":" + (((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) ? url.getRedirectURL() : url.getUrl()) + "\n");

        // and put them in a map ...
        Map<String, List<String>> headerItems = NIOHttpHeaders.parseHttpHeaders(url.getHeaders()).getHeaders();

        writer.write("content-length:" + Integer.toString(url.getContentRaw().getCount()) + "\n");

        // pull out content encoding if it is set ...
        String contentEncoding = headers.findValue("content-encoding");
        if (contentEncoding != null) {
          writer.write("content-encoding:" + contentEncoding + "\n");
        }

        String truncationFlags = "";
        if ((url.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
          truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload);
        }
        if (truncationFlags.length() != 0) {
          writer.write(PROXY_HEADER_TRUNCATION + ":" + truncationFlags + "\n");
        }

        // now walk remaining headers ...
        for (Map.Entry<String, List<String>> entry : headerItems.entrySet()) {
          // if not in exclusion list ...
          if (entry.getKey() != null && entry.getKey().length() != 0) {
            if (!dontProxyHeaders.contains(entry.getKey().toLowerCase())) {
              // and it has values ...
              if (entry.getValue() != null) {
                for (String value : entry.getValue()) {
                  writer.write(entry.getKey() + ":" + value + "\n");
                }
              }
            } else {
              if (entry.getKey().equalsIgnoreCase("content-length") && entry.getValue() != null) {
                writer.write(PROXY_HEADER_ORIGINAL_CONTENT_LEN + ":" + entry.getValue().get(0) + "\n");
              }
            }
          }
        }
        writer.write("\n");

        int contentLength = url.getContentRaw().getCount();
        byte contentData[] = url.getContentRaw().getReadOnlyBytes();

        if (contentEncoding != null && contentEncoding.equalsIgnoreCase("gzip")) {
          UnzipResult result = GZIPUtils.unzipBestEffort(contentData, CrawlEnvironment.CONTENT_SIZE_LIMIT);
          if (result != null) {
            contentData = result.data.get();
            contentLength = result.data.getCount();
          }
        }

        BufferedReader bufferedReader = readerForCharset(headers, contentData, contentLength, writer);
        try {
          String line = null;
          while ((line = bufferedReader.readLine()) != null) {
            writer.println(line);
          }
        } finally {
          bufferedReader.close();
        }
        writer.flush();
      } else {
        response.setHeader(PROXY_HEADER_SOURCE, "origin");
        response.setHeader(PROXY_HEADER_TIMER, (System.currentTimeMillis() - requestStartTime) + "MS");
        response.setHeader(PROXY_HEADER_FINALURL, (((url.getFlags() & CrawlURL.Flags.IsRedirected) != 0) ? url.getRedirectURL() : url.getUrl()));

        // parse headers ...
        NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(url.getHeaders());
        // and put them in a map ...
        Map<String, List<String>> headerItems = NIOHttpHeaders.parseHttpHeaders(url.getHeaders()).getHeaders();

        // set the content length ...
        response.setHeader("content-length", Integer.toString(url.getContentRaw().getCount()));

        String truncationFlags = "";
        if ((url.getFlags() & CrawlURL.Flags.TruncatedDuringDownload) != 0) {
          truncationFlags += ArcFileItem.Flags.toString(ArcFileItem.Flags.TruncatedInDownload);
        }
        if (truncationFlags.length() != 0) {
          response.setHeader(PROXY_HEADER_TRUNCATION, truncationFlags);
        }

        // pull out content encoding if it is set ...
        String contentEncoding = headers.findValue("content-encoding");
        if (contentEncoding != null) {
          response.setHeader("content-encoding", contentEncoding);
        }

        // now walk remaining headers ...
        for (Map.Entry<String, List<String>> entry : headerItems.entrySet()) {
          // if not in exclusion list ...
          if (entry.getKey() != null && entry.getKey().length() != 0) {
            if (!dontProxyHeaders.contains(entry.getKey().toLowerCase())) {
              // and it has values ...
              if (entry.getValue() != null) {
                for (String value : entry.getValue()) {
                  response.setHeader(entry.getKey(), value);
                }
              }
            } else {
              if (entry.getKey().equalsIgnoreCase("content-length") && entry.getValue() != null) {
                response.setHeader(PROXY_HEADER_ORIGINAL_CONTENT_LEN, entry.getValue().get(0));
              }
            }
          }
        }

        ServletOutputStream responseOutputStream = response.getOutputStream();
        // write out content bytes
        responseOutputStream.write(url.getContentRaw().getReadOnlyBytes(), 0, url.getContentRaw().getCount());
      }
    }
    // otherwise failed for some other reason ...
    else {
      /*
      ProxyServer.getSingleton().logProxyFailure(500,
          CrawlURL.FailureReason.toString(url.getLastAttemptFailureReason()) + " - " + url.getLastAttemptFailureDetail(),
          url.getUrl(), url.getRedirectURL(), requestStartTime);
      */
      // report the reason ...
      response.sendError(500,
          CrawlURL.FailureReason.toString(url.getLastAttemptFailureReason()) + " - " + url.getLastAttemptFailureDetail());
    }
  }
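  // Cache-miss escalation path: a missed lookup can be forwarded to the query
  // master, which serves archived content out of S3 (queueQueryMasterURLRequest
  // below), and failing that the URL is queued as a high-priority live fetch
  // (queueHighPriorityURLRequest). Note that checkCacheForURL currently hard-codes
  // the query master check to false, and checkCacheForURLV2 skips it entirely, so
  // both paths presently go straight to the crawler.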
  private static void queueQueryMasterURLRequest(final String targetURL, final long urlFingerprint,
      final AsyncResponse responseData, final Semaphore completionSemaphore, final long timeoutInMS,
      final boolean skipHTTPFetch) {

    ContentQueryRPCInfo rpcQueryInfo = new ContentQueryRPCInfo();

    //TODO: UNFORTUNATE HACK
    GoogleURL canonicalURL = new GoogleURL(targetURL);

    rpcQueryInfo.setUrl(canonicalURL.getCanonicalURL());

    try {
      ProxyServer.getSingleton().getQueryMasterStub().doContentQuery(rpcQueryInfo,
          new AsyncRequest.Callback<ContentQueryRPCInfo, ContentQueryRPCResult>() {

            @Override
            public void requestComplete(AsyncRequest<ContentQueryRPCInfo, ContentQueryRPCResult> request) {
              if (request.getStatus() == AsyncRequest.Status.Success && request.getOutput().getSuccess()) {
                if (request.getOutput().getArcFileResult().getContent().getCount() == (CrawlEnvironment.ORIGINAL_CONTENT_SIZE_LIMIT + 1)) {
                  LOG.error("RPC to QueryMaster Successful BUT content size is 131072. Suspecting truncation. REJECTING S3 Data for targetURL:" + targetURL);
                  queueHighPriorityURLRequest(targetURL, urlFingerprint, responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
                } else {
                  LOG.info("RPC to QueryMaster Successful. Servicing request for targetURL:" + targetURL + " via s3 cache");
                  // cache the http result
                  cacheS3ItemResult(request.getOutput().getArcFileResult(), targetURL, urlFingerprint);
                  // set the result data ..
                  responseData.setS3ItemResponse(request.getOutput().getArcFileResult());
                  // and set the completion semaphore ...
                  completionSemaphore.release();
                }
              } else {
                LOG.info("RPC to QueryMaster Failed. Servicing request for targetURL:" + targetURL + " via crawler");
                queueHighPriorityURLRequest(targetURL, urlFingerprint, responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
              }
            }
          });
    } catch (RPCException e) {
      LOG.error("RPC to Query Master for targetURL:" + targetURL + " Failed with Exception:" + CCStringUtils.stringifyException(e));
      // queue it up for direct service via the crawler ...
      queueHighPriorityURLRequest(targetURL, urlFingerprint, responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
    }
  }
  private static void queueHighPriorityURLRequest(final String targetURL, final long urlFingerprint,
      final AsyncResponse responseData, final Semaphore completionSemaphore, final long timeoutInMS,
      final boolean skipHTTPFetch) {

    // first check the skip fetch flag ...
    if (skipHTTPFetch) {
      // setup an async callback that fails the request immediately ...
      ProxyServer.getSingleton().getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() {

        @Override
        public void timerFired(Timer timer) {
          responseData.setHttpErrorResponse(403, "Request Not Found In Cache");
          responseData.setCrawlComplete(true);
          // and set the completion semaphore ...
          completionSemaphore.release();
        }
      }));
      return;
    }

    // 3. ok, time to dispatch this request via the crawler ...
    ProxyServer.getSingleton().queueHighPriorityURL(targetURL, urlFingerprint, new CrawlItemStatusCallback() {

      @Override
      public void crawlComplete(NIOHttpConnection connection, CrawlURL urlObject, CrawlTarget optTargetObj, boolean success) {
        if (!success) {
          // set failure code on url ..
          urlObject.setLastAttemptResult((byte) CrawlURL.CrawlResult.FAILURE);
        }
        // cache the http result
        cacheCrawlURLResult(urlObject, null);

        // if the item was not timed out ...
        if (!responseData.isCrawlComplete()) {
          // set the result data ..
          responseData.setURLItemResponse(urlObject);
          // and set the completion semaphore ...
          completionSemaphore.release();
        }
      }

      @Override
      public void crawlStarting(CrawlTarget target) {
        // reset start time to http request start time ...
        responseData.setStartTime(System.currentTimeMillis());
      }
    });

    // and setup a timeout timer ...
    ProxyServer.getSingleton().getEventLoop().setTimer(new Timer(timeoutInMS, false, new Timer.Callback() {

      @Override
      public void timerFired(Timer timer) {
        // check to see if the request is already complete or not
        if (!responseData.isCrawlComplete()) {
          responseData.setHttpErrorResponse(500, "Request Timed Out");
          responseData.setCrawlComplete(true);
          // and set the completion semaphore ...
          completionSemaphore.release();
        }
      }
    }));
  }
  private static boolean checkCacheForURL(final String targetURL, final AsyncResponse responseData,
      final Semaphore completionSemaphore, final long timeoutInMS, final boolean skipHTTPFetch) {

    // normalize the url ...
    try {
      final String normalizedURL = URLUtils.canonicalizeURL(targetURL, true);
      final long urlFingerprint = URLFingerprint.generate64BitURLFPrint(normalizedURL);

      // 1. check the cache for data
      ProxyServer.getSingleton().getCache().checkCacheForItem(normalizedURL, urlFingerprint, new CacheItemCheckCallback() {

        @Override
        public void cacheItemAvailable(String url, CacheItem item) {
          // if redirected ... get the redirected url ...
          if ((item.getFlags() & (CacheItem.Flags.Flag_IsPermanentRedirect | CacheItem.Flags.Flag_IsTemporaryRedirect)) != 0) {
            LOG.info("Redirect Detected for TargetURL:" + targetURL + " Checking Cache for Final URL:" + item.getFinalURL());
            // resubmit the request to the cache
            if (!checkCacheForURL(item.getFinalURL(), responseData, completionSemaphore, timeoutInMS, skipHTTPFetch)) {
              // immediate failure detected ...
              responseData.setHttpErrorResponse(400, "MalformedURLException parsing Redirect URL:" + item.getFinalURL());
              // release completion semaphore
              completionSemaphore.release();
            }
          }
          // otherwise no redirects detected ..
          else {
            LOG.info("Servicing Response for URL:" + url + " via cache. Item Content Size is:" + item.getContent().getCount());
            // cached data is available ...
            // set the appropriate data member in the response object ...
            // and return to the calling thread (so that it can do the blocking io to service the request)
            responseData.setCacheItemResponse(item);
            // release completion semaphore
            completionSemaphore.release();
          }
        }

        @Override
        public void cacheItemNotFound(String url) {
          // 2. time to hit the query master server (if available)
          if (false /*ProxyServer.getSingleton().isConnectedToQueryMaster()*/) {
            LOG.info("Query Master Online. Sending Request:" + targetURL + " to queryMaster");
            queueQueryMasterURLRequest(targetURL, urlFingerprint, responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
          } else {
            LOG.info("Query Master Offline. Sending Request:" + targetURL + " directly to crawler");
            // otherwise skip and go direct to the crawler queue ...
            queueHighPriorityURLRequest(targetURL, urlFingerprint, responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
          }
        }
      });

      // response will complete asynchronously ...
      return true;
    } catch (MalformedURLException e) {
      responseData.setHttpErrorResponse(400, "MalformedURLException parsing URL:" + targetURL);
      // immediate response
      return false;
    }
  }

  private static boolean checkCacheForURLV2(final String targetURL, final AsyncResponse responseData,
      final Semaphore completionSemaphore, final long timeoutInMS, final boolean skipHTTPFetch) {

    // normalize the url ...
    try {
      final String normalizedURL = URLUtils.canonicalizeURL(targetURL, true);
      final long urlFingerprint = URLFingerprint.generate64BitURLFPrint(normalizedURL);

      // 1. check the cache for data (synchronously, in the worker thread)
      CacheItem item = ProxyServer.getSingleton().getCache().checkCacheForItemInWorkerThread(normalizedURL, urlFingerprint);

      if (item != null) {
        // if redirected ... get the redirected url ...
        if ((item.getFlags() & (CacheItem.Flags.Flag_IsPermanentRedirect | CacheItem.Flags.Flag_IsTemporaryRedirect)) != 0) {
          LOG.info("Redirect Detected for TargetURL:" + targetURL + " Checking Cache for Final URL:" + item.getFinalURL());
          // resubmit the request to the cache
          return checkCacheForURLV2(item.getFinalURL(), responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
        }
        // otherwise no redirects detected ..
        else {
          LOG.info("Servicing Response for URL:" + targetURL + " via cache. Item Content Size is:" + item.getContent().getCount());
          // cached data is available ...
          // set the appropriate data member in the response object ...
          // and return to the calling thread (so that it can do the blocking io to service the request)
          responseData.setCacheItemResponse(item);
          return false;
        }
      } else {
        ProxyServer.getSingleton().getEventLoop().setTimer(new Timer(0, false, new Timer.Callback() {

          @Override
          public void timerFired(Timer timer) {
            LOG.info("Query Master Offline. Sending Request:" + targetURL + " directly to crawler");
            // skip the query master and go direct to the crawler queue ...
            queueHighPriorityURLRequest(targetURL, urlFingerprint, responseData, completionSemaphore, timeoutInMS, skipHTTPFetch);
          }
        }));
        // response will complete asynchronously ...
        return true;
      }
    } catch (MalformedURLException e) {
      responseData.setHttpErrorResponse(400, "MalformedURLException parsing URL:" + targetURL);
    }
    // immediate response
    return false;
  }
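  // The doGet implementation below is the original, callback-driven version built
  // on checkCacheForURL and AsyncWebServerRequest; it appears to have been kept,
  // commented out, for reference. The active implementation follows it.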
  /*
  @Override
  public void doGet(final HttpServletRequest req, final HttpServletResponse response) throws ServletException, IOException {

    // allocate a response data object ... which will be used by the async thread to pass data to the calling thread ...
    final AsyncResponse responseData = new AsyncResponse();

    final String path = req.getParameter("url");
    final String format = (req.getParameter("renderAs") != null) ? req.getParameter("renderAs") : PROXY_RENDER_TYPE_NONE;
    final String timeoutStr = req.getParameter("timeout");
    final String skipHTTPGET = req.getParameter("nocachenodice");
    final long desiredTimeOutInMS = (timeoutStr != null) ? Long.parseLong(timeoutStr) : 30000;
    final boolean skipHTTPGet = (skipHTTPGET != null && skipHTTPGET.equals("1"));

    LOG.info("Got Request:" + path);

    final long requestStartTime = System.currentTimeMillis();

    AsyncWebServerRequest request = new AsyncWebServerRequest("proxyRequest") {

      @Override
      public boolean handleRequest(final Semaphore completionSemaphore) throws IOException {
        // called within the async event thread context ...
        // so, we have to be careful NOT to do any cpu intensive / blocking operations here !!!
        LOG.info("Processing Request:" + path);

        String hostName = (path != null) ? URLUtils.fastGetHostStringFromURL(path) : "";

        if (path == null || !path.startsWith("http:") || hostName.length() == 0) {
          LOG.info("URL From Proxy Request:" + path + " is Invalid. Sending 400 Result Code");
          responseData.setHttpErrorResponse(400, "URL From Proxy Request:" + path + " is Invalid");
          return false;
        } else {
          LOG.info("Scheduling Cache Lookup for URL:" + path);
          checkCacheForURL(path, responseData, completionSemaphore, desiredTimeOutInMS, skipHTTPGet);
          return true;
        }
      }
    };

    // ok this call will block ...
    request.dispatch(ProxyServer.getSingleton().getEventLoop());

    // upon return we need to check the response object ...
    if (responseData.getResponseType() == AsyncResponse.ResponseType.CacheItemResponse) {
      // send cache item response ...
      sendCacheItemResponse(req, response, responseData.getCacheItem(), false, format, responseData, requestStartTime);
    } else if (responseData.getResponseType() == AsyncResponse.ResponseType.CrawlURLResponse) {
      sendCrawlURLResponse(req, response, responseData.getCrawlURL(), format, responseData, requestStartTime);
    } else if (responseData.getResponseType() == AsyncResponse.ResponseType.S3Response) {
      sendS3ItemResponse(req, response, responseData.getArcFileItem(), format, responseData, requestStartTime);
    } else {
      response.sendError(responseData.getHttpErrorCode(), responseData.getHttpErrorDesc());
      ProxyServer.getSingleton().logProxyFailure(responseData.getHttpErrorCode(), responseData.getHttpErrorDesc(), path, "", responseData.getStartTime());
    }
    request = null;
  }
  */
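  // Active doGet: checks the cache synchronously on the servlet thread via
  // checkCacheForURLV2, and blocks on the semaphore only when the request goes
  // asynchronous (crawler fetch and/or timeout path).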
  @Override
  public void doGet(final HttpServletRequest req, final HttpServletResponse response) throws ServletException, IOException {

    // allocate a response data object ... which will be used by the async thread to pass data to the calling thread ...
    final AsyncResponse responseData = new AsyncResponse();

    String queryString = req.getQueryString();

    final String originalPath = req.getParameter("url");
    final String format = (req.getParameter("renderAs") != null) ? req.getParameter("renderAs") : PROXY_RENDER_TYPE_NONE;
    final String timeoutStr = req.getParameter("timeout");
    final String skipHTTPGET = req.getParameter("nocachenodice");
    final long desiredTimeOutInMS = (timeoutStr != null) ? Long.parseLong(timeoutStr) : 30000;
    final boolean skipHTTPGet = (skipHTTPGET != null && skipHTTPGET.equals("1"));
    final Semaphore semaphore = new Semaphore(0);

    //LOG.info("Got Request:" + originalPath);

    final long requestStartTime = System.currentTimeMillis();

    //LOG.info("Processing Request:" + originalPath);

    String hostName = (originalPath != null) ? URLUtils.fastGetHostFromURL(originalPath) : "";
    String fullPath = null;

    if (originalPath == null || !originalPath.startsWith("http:") || hostName.length() == 0 || queryString == null) {
      LOG.info("URL From Proxy Request:" + originalPath + " is Invalid. Sending 400 Result Code");
      responseData.setHttpErrorResponse(400, "URL From Proxy Request:" + originalPath + " is Invalid");
    } else {
      // build the url path from the raw query string (rather than via getParameter),
      // presumably so that any query parameters embedded in the target url survive ...
      int pathIndex = queryString.indexOf("url=");
      // grab the whole path ...
      fullPath = queryString.substring(pathIndex + "url=".length());
      // unescape it
      fullPath = URLDecoder.decode(fullPath, "UTF-8");

      //LOG.info("Doing Cache Lookup for URL:" + fullPath);
      boolean isAsyncOperation = checkCacheForURLV2(fullPath, responseData, semaphore, desiredTimeOutInMS, skipHTTPGet);
      if (isAsyncOperation) {
        //LOG.info("Waiting on Async Completion for URL:" + fullPath);
        semaphore.acquireUninterruptibly();
        //LOG.info("Done Waiting for Async Completion for URL:" + fullPath);
      }
    }

    // upon return we need to check the response object ...
    if (responseData.getResponseType() == AsyncResponse.ResponseType.CacheItemResponse) {
      // send cache item response ...
      sendCacheItemResponse(req, response, responseData.getCacheItem(), false, format, responseData, requestStartTime);
    } else if (responseData.getResponseType() == AsyncResponse.ResponseType.CrawlURLResponse) {
      sendCrawlURLResponse(req, response, responseData.getCrawlURL(), format, responseData, requestStartTime);
    } else if (responseData.getResponseType() == AsyncResponse.ResponseType.S3Response) {
      sendS3ItemResponse(req, response, responseData.getArcFileItem(), format, responseData, requestStartTime);
    } else {
      response.sendError(responseData.getHttpErrorCode(), responseData.getHttpErrorDesc());
      ProxyServer.getSingleton().logProxyFailure(responseData.getHttpErrorCode(), responseData.getHttpErrorDesc(), fullPath, "", responseData.getStartTime());
    }
  }
}