/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.junit.Test;

/**
 * Helpers that derive HTTP cache freshness / age information from the
 * cache-related fields recorded in a {@link CrawlURLMetadata}.
 *
 * @author rana
 */
public class HttpCacheUtils {

  /** Heuristic freshness window used when no explicit lifetime is present: seven days, in milliseconds. */
  static final long kDefaultExpireTime = 86400000L * 7;

  private static final Log LOG = LogFactory.getLog(HttpCacheUtils.class);

  /**
   * Tells whether the cache-control flags permit storing the response.
   *
   * @param metadata metadata whose cache-control flags are inspected
   * @return {@code false} if any of the NO_CACHE, NO_STORE or VARY flags is
   *         set, {@code true} otherwise
   */
  public static boolean isCacheable(CrawlURLMetadata metadata) {
    return (((int) metadata.getCacheControlFlags())
        & (CrawlURLMetadata.CacheControlFlags.NO_CACHE
            | CrawlURLMetadata.CacheControlFlags.NO_STORE
            | CrawlURLMetadata.CacheControlFlags.VARY)) == 0;
  }

  /**
   * Result of a freshness computation: the lifetime in milliseconds together
   * with the rule that produced it.
   */
  public static class LifeTimeInfo {

    /** Identifies which rule determined {@link LifeTimeInfo#_lifetime}. */
    enum Source {
      CacheControl,
      MaxAge,
      Expires,
      LastModifiedTime,
      CurrentTime,
      PermanentRedirect,
      NoCacheMetadata
    }

    /** Freshness lifetime in milliseconds; stays 0 when no rule assigns a value. */
    public long _lifetime = 0;

    /** Rule that produced {@link #_lifetime}. */
    public Source _source;
  }

  /**
   * Computes the freshness lifetime of a response. Rules are tried in this
   * order, the first match wins:
   * <ol>
   * <li>no-cache / no-store / vary flag set: lifetime 0 (CacheControl)</li>
   * <li>max-age field set: max-age converted to milliseconds (MaxAge)</li>
   * <li>expires field set: expires minus fetch time, floored at 0 (Expires)</li>
   * <li>result code 200 or 203 without must-revalidate: heuristic based on
   * {@link #kDefaultExpireTime} (LastModifiedTime or CurrentTime)</li>
   * <li>result code 300, 301 or 410: {@link Long#MAX_VALUE} (PermanentRedirect)</li>
   * <li>otherwise: lifetime 0 (NoCacheMetadata)</li>
   * </ol>
   * The fetch time is the HTTP date field when set, else the last fetch
   * timestamp when set, else 0.
   *
   * @param metadata metadata describing the fetched response
   * @return a new {@link LifeTimeInfo}; never {@code null}
   */
  public static LifeTimeInfo getFreshnessLifetimeInMilliseconds(CrawlURLMetadata metadata) {
    LifeTimeInfo infoOut = new LifeTimeInfo();

    // no-cache, no-store or vary: nothing may be served from cache.
    if (!isCacheable(metadata)) {
      infoOut._source = LifeTimeInfo.Source.CacheControl;
      return infoOut;
    }

    if (metadata.isFieldDirty(CrawlURLMetadata.Field_MAXAGE)) {
      // 1000L forces long arithmetic so a large max-age (e.g. one year in
      // seconds) cannot overflow should the getter return an int.
      infoOut._lifetime = metadata.getMaxAge() * 1000L;
      infoOut._source = LifeTimeInfo.Source.MaxAge;
      return infoOut;
    }

    // Figure out the fetch time to use: either the date header or the actual fetch time.
    long fetchTime = 0;
    if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
      fetchTime = metadata.getHttpDate();
    } else if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTFETCHTIMESTAMP)) {
      fetchTime = metadata.getLastFetchTimestamp();
    }

    // No max-age: see if expires is present.
    if (metadata.isFieldDirty(CrawlURLMetadata.Field_EXPIRES)) {
      infoOut._lifetime = Math.max(0, metadata.getExpires() - fetchTime);
      infoOut._source = LifeTimeInfo.Source.Expires;
      return infoOut;
    }

    // Heuristic freshness for 200 / 203 responses that do not demand revalidation.
    if (metadata.getHttpResultCode() == 200 || metadata.getHttpResultCode() == 203) {
      if ((((int) metadata.getCacheControlFlags())
          & CrawlURLMetadata.CacheControlFlags.MUST_REVALIDATE) == 0) {

        // If last-modified is present (and not after the fetch time), shrink
        // the default window by the document's age at fetch time.
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)
            && metadata.getLastModifiedTime() <= fetchTime) {
          infoOut._lifetime =
              Math.max(0, kDefaultExpireTime - (fetchTime - metadata.getLastModifiedTime()));
          infoOut._source = LifeTimeInfo.Source.LastModifiedTime;
        }

        // Still zero (no usable last-modified, or it produced 0): treat the
        // fetch time as the last modified time instead.
        if (infoOut._lifetime == 0) {
          infoOut._lifetime =
              Math.max(0, kDefaultExpireTime - (System.currentTimeMillis() - fetchTime));
          infoOut._source = LifeTimeInfo.Source.CurrentTime;
        }
        return infoOut;
      }
    }

    if (metadata.getHttpResultCode() == 300
        || metadata.getHttpResultCode() == 301
        || metadata.getHttpResultCode() == 410) {
      infoOut._lifetime = Long.MAX_VALUE;
      infoOut._source = LifeTimeInfo.Source.PermanentRedirect;
      return infoOut;
    }

    infoOut._source = LifeTimeInfo.Source.NoCacheMetadata;
    return infoOut;
  }

  /**
   * Computes the current age of a response in milliseconds as
   * {@code max(apparentAge, ageValue) + residentTime}, where
   * <ul>
   * <li>apparentAge = max(0, lastFetchTimestamp - date)</li>
   * <li>ageValue = the metadata's age value multiplied by 1000</li>
   * <li>residentTime = now - lastFetchTimestamp</li>
   * </ul>
   *
   * @param metadata metadata describing the fetched response
   * @return the current age in milliseconds
   */
  public static long getCurrentAgeInMilliseconds(CrawlURLMetadata metadata) {
    // If there is no Date header, assume the server response was generated
    // at the time when we received the response.
    long dateValue = metadata.getLastFetchTimestamp();
    if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
      dateValue = metadata.getHttpDate();
    }

    // getAge() is read unconditionally (no dirty check).
    // NOTE(review): presumably it is 0 when no Age header was seen - confirm.
    // 1000L keeps the multiplication in long arithmetic.
    long ageValue = metadata.getAge() * 1000L;

    long apparentAge = Math.max(0, metadata.getLastFetchTimestamp() - dateValue);
    long correctedReceivedAge = Math.max(apparentAge, ageValue);
    long residentTime = System.currentTimeMillis() - metadata.getLastFetchTimestamp();

    return correctedReceivedAge + residentTime;
  }

  /**
   * Tells whether a cached response must be revalidated before reuse.
   *
   * @param metadata metadata describing the fetched response
   * @return {@code true} when the freshness lifetime is 0 or is less than or
   *         equal to the current age
   */
  public static boolean requiresValidation(CrawlURLMetadata metadata) {
    LifeTimeInfo lifetimeInfo = getFreshnessLifetimeInMilliseconds(metadata);
    if (lifetimeInfo._lifetime == 0) {
      return true;
    }
    return lifetimeInfo._lifetime <= getCurrentAgeInMilliseconds(metadata);
  }

  /** JUnit entry point; runs the sample-header walkthrough. */
  @Test
  public void TestCacheUtils() throws Exception {
    validateCacheControlParser();
  }

  /**
   * Parses a set of sample response headers and prints the computed freshness
   * lifetime, current age and revalidation decision for each. A sample whose
   * header extraction fails with an {@link IOException} is logged and skipped.
   */
  private void validateCacheControlParser() throws Exception {

    String sampleHeaders[] = {

        // last-modified heuristic: modified a while ago
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
        "\n",

        "HTTP/1.1 200 OK\n" +
        "Etag: \"34534-d3 134q\"\n" +
        "\n",

        // valid for a little while
        "HTTP/1.1 200 OK\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // expires in the future
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 01:00:00 GMT\n" +
        "\n",

        // expired already
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
        "\n",

        // max-age trumps expires
        // NOTE(review): this sample only repeats the status line; it looks
        // like copy/paste residue - confirm before removing.
        "HTTP/1.1 200 OK\n" +
        "HTTP/1.1 200 OK\n" +
        "\n",

        // valid for a little while
        "HTTP/1.1 200 OK\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // expires in the future
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 01:00:00 GMT\n" +
        "\n",

        // expired already
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
        "\n",

        // max-age trumps expires
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // last-modified heuristic: modified recently
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "last-modified: Wed, 28 Nov 2007 00:40:10 GMT\n" +
        "\n",

        // cached permanent redirect
        "HTTP/1.1 301 Moved Permanently\n" +
        "\n",

        // cached redirect: not reusable even though by default it would be
        "HTTP/1.1 300 Multiple Choices\n" +
        "Cache-Control: no-cache\n" +
        "\n",

        // cached forever by default
        "HTTP/1.1 410 Gone\n" +
        "\n",

        // cached temporary redirect: not reusable
        "HTTP/1.1 302 Found\n" +
        "\n",

        // cached temporary redirect: reusable
        "HTTP/1.1 302 Found\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // cache-control: max-age=N overrides expires: date in the past
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 00:20:11 GMT\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // cache-control: no-store overrides expires: in the future
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 29 Nov 2007 00:40:11 GMT\n" +
        "cache-control: no-store,private,no-cache=\"foo\"\n" +
        "\n",

        // pragma: no-cache overrides last-modified heuristic
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
        "pragma: no-cache\n" +
        "\n",

        // NOTE(review): this sample has no status line; it looks like
        // copy/paste residue - confirm before removing.
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // last-modified heuristic: modified a while ago
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
        "\n",

        // last-modified heuristic: modified recently
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "last-modified: Wed, 28 Nov 2007 00:40:10 GMT\n" +
        "\n",

        // cached permanent redirect
        "HTTP/1.1 301 Moved Permanently\n" +
        "\n",

        // cached redirect: not reusable even though by default it would be
        "HTTP/1.1 300 Multiple Choices\n" +
        "Cache-Control: no-cache\n" +
        "\n",

        // cached forever by default
        "HTTP/1.1 410 Gone\n" +
        "\n",

        // cached temporary redirect: not reusable
        "HTTP/1.1 302 Found\n" +
        "\n",

        // cached temporary redirect: reusable
        "HTTP/1.1 302 Found\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // cache-control: max-age=N overrides expires: date in the past
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 28 Nov 2007 00:20:11 GMT\n" +
        "cache-control: max-age=10000\n" +
        "\n",

        // cache-control: no-store overrides expires: in the future
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "expires: Wed, 29 Nov 2007 00:40:11 GMT\n" +
        "cache-control: no-store,private,no-cache=\"foo\"\n" +
        "\n",

        // pragma: no-cache overrides last-modified heuristic
        "HTTP/1.1 200 OK\n" +
        "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
        "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
        "pragma: no-cache\n" +
        "\n"
    };

    for (String header : sampleHeaders) {
      NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(header);
      CrawlURLMetadata metadata = new CrawlURLMetadata();

      System.out.println("****Original Header:" + header);
      System.out.println("****Parsed Results:");
      try {
        HttpHeaderInfoExtractor.parseHeaders(headers, metadata);
        // set fetch time
        metadata.setLastFetchTimestamp(System.currentTimeMillis());

        LifeTimeInfo freshnessLifetime = getFreshnessLifetimeInMilliseconds(metadata);
        long currentAge = getCurrentAgeInMilliseconds(metadata);
        boolean requiresValidation = requiresValidation(metadata);

        System.out.println("Freshness:" + freshnessLifetime._lifetime
            + " CurrentAge:" + currentAge
            + " RequiresRevalidation:" + requiresValidation);
      } catch (IOException e) {
        // Keep walking the remaining samples, but make the failure visible
        // instead of silently dropping it.
        LOG.error("Failed to extract cache info from sample header:" + header, e);
      }
    }
  }
}