/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.IOException;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Iterator;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.text.SimpleDateFormat;

import junit.framework.Assert;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.util.DateUtils.DateParser;
import org.commoncrawl.util.Tuples.Pair;
import org.junit.Test;

import com.google.common.collect.ImmutableSet;

/**
 * Uses http header information to populate http cache related information
 * into the CrawlURLMetadata data structure.
 *
 * <p>All parsing here is best-effort: malformed header values are skipped
 * rather than reported, because the input comes from arbitrary web servers.
 *
 * @author rana
 */
public class HttpHeaderInfoExtractor {

  private static final Log LOG = LogFactory.getLog(HttpHeaderInfoExtractor.class);

  /**
   * Runs every header parser in this class against {@code headers} and stores
   * the results in {@code metadataInOut}.
   *
   * @param headers       parsed response headers (status line at index 0)
   * @param metadataInOut metadata record that receives the extracted values
   * @throws IOException  declared for the cache-control parsing step
   */
  public static void parseHeaders(NIOHttpHeaders headers, CrawlURLMetadata metadataInOut) throws IOException {
    parseStatusLine(headers, metadataInOut);
    parseContentType(headers, metadataInOut);
    parseContentLength(headers, metadataInOut);
    populateETag(headers, metadataInOut);
    populateAgeValue(headers, metadataInOut);
    populateDateValue(headers, metadataInOut);
    populateLastModifiedValue(headers, metadataInOut);
    populateExpiresValue(headers, metadataInOut);
    populateCacheControlFlags(headers, metadataInOut);
  }

  /** Parses the status line stored as value 0 of {@code headers} into {@code metadata}. */
  public static void parseStatusLine(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    String responseLine = headers.getValue(0);
    parseStatusLine(responseLine, metadata);
  }

  /**
   * Parses {@code responseLine} and stores the result code (always) and the
   * response flags (only when non-zero) in {@code metadata}.
   */
  public static void parseStatusLine(String responseLine, CrawlURLMetadata metadata) {
    Pair<Integer, Integer> result = parseStatusLine(responseLine);
    if (result.e1 != 0) {
      metadata.setHttpResponseFlags(result.e1.byteValue());
    }
    metadata.setHttpResultCode(result.e0.intValue());
  }

  /**
   * Parses an HTTP status line such as {@code "HTTP/1.1 200 OK"}.
   *
   * @param responseLine the raw status line; may be null
   * @return a pair whose {@code e0} is the result code (defaults to 200 when
   *         no code could be parsed) and whose {@code e1} is one of the
   *         {@code CrawlURLMetadata.HTTPResponseFlags} values
   *         (HEADER_MISSING, VERSION_MISSING, VERSION_0_9, VERSION_1_0,
   *         VERSION_1_1)
   */
  public static Pair<Integer, Integer> parseStatusLine(String responseLine) {
    Pair<Integer, Integer> resultOut = new Pair<Integer, Integer>(200, 0);

    if (responseLine == null || responseLine.length() < 4) {
      resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.HEADER_MISSING;
      return resultOut;
    }

    responseLine = responseLine.toLowerCase(Locale.ROOT);
    if (!responseLine.startsWith("http")) {
      resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.HEADER_MISSING;
      return resultOut;
    }

    boolean versionValid = false;

    // Both conditions must hold (&&): a bare "http" has no character at index 4.
    if (responseLine.length() > 4 && responseLine.charAt(4) == '/') {
      int indexOfDot = responseLine.indexOf('.', 5);
      // Require a dot, at least one character before it (the major version)
      // and at least one character after it (the minor version).
      if (indexOfDot != -1 && indexOfDot != 5 && indexOfDot + 1 < responseLine.length()) {
        char majorVersionChar = responseLine.charAt(5);
        char minorVersionChar = responseLine.charAt(indexOfDot + 1);

        if (majorVersionChar >= '0' && majorVersionChar <= '9'
            && minorVersionChar >= '0' && minorVersionChar <= '9') {

          int majorVersion = majorVersionChar - '0';
          int minorVersion = minorVersionChar - '0';

          if (majorVersion == 1 && minorVersion == 0) {
            resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_1_0;
          } else if (majorVersion == 1 && minorVersion == 1) {
            resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_1_1;
          } else {
            // any other numeric version is recorded as 0.9
            resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_0_9;
          }
          versionValid = true;

          // now skip past the version to the status code digits
          int spaceIndex = responseLine.indexOf(' ', indexOfDot + 1);
          if (spaceIndex != -1 && spaceIndex + 1 < responseLine.length()) {
            int digitStart = spaceIndex + 1;
            int digitEnd = digitStart;
            while (digitEnd < responseLine.length()) {
              char c = responseLine.charAt(digitEnd);
              if (c < '0' || c > '9') {
                break;
              }
              ++digitEnd;
            }
            if (digitEnd != digitStart) {
              try {
                resultOut.e0 = Integer.parseInt(responseLine.substring(digitStart, digitEnd));
              } catch (NumberFormatException ignored) {
                // digit run too long for an int; keep the default result code
              }
            }
          }
        }
      }
    }

    if (!versionValid) {
      resultOut.e1 = CrawlURLMetadata.HTTPResponseFlags.VERSION_MISSING;
    }
    return resultOut;
  }

  /** Copies the Etag header, if present, into {@code metadata}. */
  static void populateETag(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    String etagValue = headers.findValue("Etag");
    if (etagValue != null) {
      metadata.setETag(etagValue);
    }
  }

  /** Stores the Age header (seconds) in {@code metadata}; non-numeric values are ignored. */
  static void populateAgeValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    String ageValue = headers.findValue("Age");
    if (ageValue != null) {
      try {
        metadata.setAge(Long.parseLong(ageValue.trim()));
      } catch (NumberFormatException ignored) {
        // best effort: a malformed Age header is simply not recorded
      }
    }
  }

  /** Stores the parsed Date header in {@code metadata} when it can be parsed. */
  static void populateDateValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    long timeValue = getTimeHeaderValue("Date", headers);
    if (timeValue != -1) {
      metadata.setHttpDate(timeValue);
    }
  }

  /** Stores the parsed Last-Modified header in {@code metadata} when it can be parsed. */
  static void populateLastModifiedValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    long timeValue = getTimeHeaderValue("Last-Modified", headers);
    if (timeValue != -1) {
      metadata.setLastModifiedTime(timeValue);
    }
  }

  /** Stores the parsed Expires header in {@code metadata} when it can be parsed. */
  static void populateExpiresValue(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    long timeValue = getTimeHeaderValue("Expires", headers);
    if (timeValue != -1) {
      metadata.setExpires(timeValue);
    }
  }

  static final String kMaxAgePrefix = "max-age=";

  /** ORs {@code flag} into the cache-control flags already stored in {@code metadata}. */
  private static void addCacheControlFlag(CrawlURLMetadata metadata, int flag) {
    metadata.setCacheControlFlags((byte) (metadata.getCacheControlFlags() | flag));
  }

  /**
   * Translates the cache-control, pragma and vary headers into cache-control
   * flags and a max-age value on {@code metadataInOut}.
   *
   * <p>Directives are trimmed and matched case-insensitively, so
   * {@code "no-store, Private"} sets both flags.
   *
   * @throws IOException declared for interface compatibility with callers
   */
  static void populateCacheControlFlags(NIOHttpHeaders headers, CrawlURLMetadata metadataInOut) throws IOException {

    Iterator<String> cacheControlValues = headers.multiValueIterator("cache-control");

    while (cacheControlValues.hasNext()) {
      String ccValue = cacheControlValues.next();
      if (ccValue == null) {
        continue;
      }
      StringTokenizer tokenizer = new StringTokenizer(ccValue, ",");
      while (tokenizer.hasMoreTokens()) {
        String directive = tokenizer.nextToken().trim().toLowerCase(Locale.ROOT);

        if (directive.equals("no-cache")) {
          addCacheControlFlag(metadataInOut, CrawlURLMetadata.CacheControlFlags.NO_CACHE);
        } else if (directive.equals("no-store")) {
          addCacheControlFlag(metadataInOut, CrawlURLMetadata.CacheControlFlags.NO_STORE);
        } else if (directive.equals("must-revalidate")) {
          // previously recorded as NO_STORE by mistake
          addCacheControlFlag(metadataInOut, CrawlURLMetadata.CacheControlFlags.MUST_REVALIDATE);
        } else if (directive.equals("private")) {
          addCacheControlFlag(metadataInOut, CrawlURLMetadata.CacheControlFlags.PRIVATE);
        } else if (directive.length() > kMaxAgePrefix.length() && directive.startsWith(kMaxAgePrefix)) {
          try {
            long maxAgeInSeconds = Long.parseLong(directive.substring(kMaxAgePrefix.length()).trim());
            metadataInOut.setMaxAge(maxAgeInSeconds);
          } catch (NumberFormatException ignored) {
            // best effort: a malformed max-age is simply not recorded
          }
        }
      }
    }

    Iterator<String> pragmaValues = headers.multiValueIterator("pragma");

    while (pragmaValues.hasNext()) {
      String value = pragmaValues.next();
      if (value != null && value.trim().equalsIgnoreCase("no-cache")) {
        addCacheControlFlag(metadataInOut, CrawlURLMetadata.CacheControlFlags.NO_CACHE);
      }
    }

    String varyValue = headers.findValue("vary");
    if (varyValue != null && varyValue.trim().equals("*")) {
      addCacheControlFlag(metadataInOut, CrawlURLMetadata.CacheControlFlags.VARY);
    }
  }

  /** Stores the Content-Length header in {@code metadata}; non-numeric values are ignored. */
  static void parseContentLength(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    String contentLenValue = headers.findValue("Content-Length");
    if (contentLenValue != null) {
      try {
        metadata.setHttpContentLength(Integer.parseInt(contentLenValue.trim()));
      } catch (NumberFormatException ignored) {
        // best effort: a malformed or out-of-range length is simply not recorded
      }
    }
  }

  /**
   * Feeds every content-type header value, in order, through
   * {@link #parseContentType(CrawlURLMetadata, String)}, so later headers
   * override earlier ones.
   */
  static void parseContentType(NIOHttpHeaders headers, CrawlURLMetadata metadata) {
    Iterator<String> values = headers.multiValueIterator("content-type");

    while (values.hasNext()) {
      String contentType = values.next();
      if (contentType != null) {
        parseContentType(metadata, contentType);
      }
    }
  }

  /**
   * Parses a single Content-Type header value and updates the content type
   * and charset on {@code metadataOut}.
   *
   * <p>The value is ignored entirely when it is empty, equals
   * {@code "*}{@code /*"} or contains no slash. Otherwise the media type is
   * stored lower-cased. A charset parameter, when present, is stored
   * lower-cased (the last one wins). When no charset is present, an existing
   * charset is kept only if the media type did not change.
   *
   * @param metadataOut metadata record to update
   * @param contentType raw header value, must not be null
   */
  public static final void parseContentType(CrawlURLMetadata metadataOut, String contentType) {

    // Trim leading and trailing whitespace from type. The helper's trailing
    // trim set is expected to include '(' to catch media-type comments, which
    // are not at all standard, but may occur in rare cases.
    int typeStart = HttpHeaderUtils.skipPastLWS(contentType, 0);
    typeStart = Math.min(typeStart, contentType.length());
    int typeEnd = HttpHeaderUtils.skipToLWSAndExtra(contentType, typeStart);
    if (typeEnd == -1) {
      typeEnd = contentType.length();
    }

    int charsetStart = 0;
    int charsetEnd = 0;
    boolean hasCharset = false;

    // Iterate over parameters; the last charset parameter wins.
    int paramStart = contentType.indexOf(';', typeEnd);

    if (paramStart != -1) {
      int curParamStart = paramStart + 1;
      do {
        int curParamEnd = contentType.indexOf(';', curParamStart);
        if (curParamEnd == -1) {
          curParamEnd = contentType.length();
        }

        int paramNameStart = HttpHeaderUtils.skipPastLWS(contentType, curParamStart);
        paramNameStart = Math.min(paramNameStart, curParamEnd);

        int paramNameEnd = Math.min(paramNameStart + HttpHeaderUtils.kCharset.length(), curParamEnd);

        if (contentType.substring(paramNameStart, paramNameEnd).equalsIgnoreCase(HttpHeaderUtils.kCharset)) {
          charsetStart = paramNameStart + HttpHeaderUtils.kCharset.length();
          charsetEnd = curParamEnd;
          hasCharset = true;
        }
        curParamStart = curParamEnd + 1;
      } while (curParamStart < contentType.length());
    }

    if (hasCharset) {
      try {
        // Trim leading and trailing whitespace (and an optional pair of
        // quotes) from the charset value.
        final int paramEnd = charsetEnd;
        charsetStart = HttpHeaderUtils.skipPastLWS(contentType, charsetStart);
        charsetStart = Math.min(charsetStart, charsetEnd);
        if (charsetStart == contentType.length()) {
          hasCharset = false;
        } else {
          char firstChar = contentType.charAt(charsetStart);
          if (firstChar == '"' || firstChar == '\'') {
            ++charsetStart;
            charsetEnd = contentType.indexOf(firstChar, charsetStart);
            if (charsetEnd == -1) {
              // unterminated quote: fall back to the next whitespace/extra char
              charsetEnd = HttpHeaderUtils.skipToLWSAndExtra(contentType, charsetStart);
            }
            if (charsetEnd == -1) {
              // nothing found at all: fall back to the end of the parameter
              charsetEnd = Math.max(paramEnd, charsetStart);
            }
          } else {
            int stop = HttpHeaderUtils.skipToLWSAndExtra(contentType, charsetStart);
            if (stop != -1) {
              charsetEnd = Math.min(stop, charsetEnd);
            }
          }
        }
      } catch (IndexOutOfBoundsException e) {
        hasCharset = false;
      }
    }

    // If the server sent "*/*", it is meaningless, so do not store it.
    // If the media type is unchanged, just update the charset; however, if the
    // charset is empty and the media type hasn't changed, don't wipe out an
    // existing charset. We also reject a mime-type that does not include a
    // slash. Some servers give junk after the charset parameter, which may
    // include a comma, so this check makes us a bit more tolerant.
    if (contentType.length() != 0 && !contentType.equals("*/*") && contentType.indexOf('/') != -1) {

      String originalContentType = metadataOut.getContentType();

      // Locale.ROOT keeps the result stable regardless of the JVM default locale.
      metadataOut.setContentType(contentType.substring(typeStart, typeEnd).toLowerCase(Locale.ROOT));
      if (hasCharset) {
        metadataOut.setCharset(contentType.substring(charsetStart, charsetEnd).toLowerCase(Locale.ROOT));
      } else if (metadataOut.getCharset().length() != 0
          && !originalContentType.equals(metadataOut.getContentType())) {
        metadataOut.setCharset("");
        metadataOut.setFieldClean(CrawlURLMetadata.Field_CHARSET);
      }
    }
  }

  /** Returns the parsed time of header {@code keyName}, or -1 if absent or unparseable. */
  private static long getTimeHeaderValue(String keyName, NIOHttpHeaders headers) {
    String value = headers.findValue(keyName);
    return (value != null) ? getTime(value) : -1;
  }

  /** Fallback date formats tried, in order, when the standard HTTP formats fail. */
  static final String _datePatterns[] = new String[] {
    "EEE, dd-MMM-yyyy HH:mm:ss zzz",
    "EEE MMM dd HH:mm:ss yyyy",
    "EEE MMM dd HH:mm:ss yyyy zzz",
    "EEE, MMM dd HH:mm:ss yyyy zzz",
    "EEE, dd MMM yyyy HH:mm:ss zzz",
    "EEE,dd MMM yyyy HH:mm:ss zzz",
    "EEE, dd MMM yyyy HH:mm:sszzz",
    "EEE, dd MMM yyyy HH:mm:ss",
    "EEE, dd-MMM-yy HH:mm:ss zzz",
    "EEE, dd-MMM-yy zzz",
    "EEE, dd MMM yyyy zzz",
    "EEE MMM dd yyyy zzz",
    "EEE, dd MMM yyyy HH:mm zzz",
    "yyyy/MM/dd HH:mm:ss.SSS zzz",
    "yyyy/MM/dd HH:mm:ss.SSS",
    "yyyy/MM/dd HH:mm:ss zzz",
    "yyyy/MM/dd",
    "yyyy.MM.dd HH:mm:ss",
    "yyyy.MM.dd",
    "yyyy-MM-dd HH:mm",
    "yyyy-MM-dd HH:mm:ss",
    "MMM dd yyyy HH:mm:ss. zzz",
    "MMM dd yyyy HH:mm:ss zzz",
    "dd.MM.yyyy HH:mm:ss zzz",
    "dd MM yyyy HH:mm:ss zzz",
    "dd.MM.yyyy; HH:mm:ss",
    "dd.MM.yyyy HH:mm:ss",
    "dd.MM.yyyy zzz",
    "dd.MM.yyyy",
    "dd/MM/yyyy hh:mm:ss aa zzz",
    "dd/MM/yyyy hh:mm:ss aa",
    "dd/MM/yyyy HH:mm:ss zzz",
    "dd/MM/yyyy HH:mm:ss"
    // a second, redundant "dd.MM.yyyy zzz" entry used to follow here
  };

  /** Per-thread fallback parser built from {@link #_datePatterns}. */
  static final ThreadLocal<DateParser> _dateParser = new ThreadLocal<DateParser>() {
    @Override
    protected DateParser initialValue() {
      return new DateParser(_datePatterns);
    }
  };

  /** Per-thread parser for the most common HTTP date format (SimpleDateFormat is not thread-safe). */
  static final ThreadLocal<SimpleDateFormat> _httpDateParser = new ThreadLocal<SimpleDateFormat>() {
    @Override
    protected SimpleDateFormat initialValue() {
      return new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
    }
  };

  /** Header values that are known not to be dates and are rejected without parsing. */
  static final ImmutableSet<String> badDatePatterns = new ImmutableSet.Builder<String>()
    .add("-1")
    .add("0")
    .add("GMT")
    .add("now")
    .add("Now()")
    .build();

  static final Pattern onlyDigits = Pattern.compile("[0-9]*");
  static final Pattern specialTSMatcher = Pattern.compile("\\{\\s*ts\\s*'([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}):([0-9]{2}):([0-9]{2})'\\s*\\}");

  /**
   * Parses a date header value into a millisecond timestamp.
   *
   * <p>Tried in order: an all-digit value (returned as-is), a
   * <code>{ts 'yyyy-MM-dd HH:mm:ss'}</code> value (interpreted in the JVM
   * default time zone), the standard HTTP date format,
   * {@code DateUtils.parseHttpDate}, and finally the fallback pattern list.
   *
   * @param date raw header value; may be null
   * @return the parsed time, or -1 when the value is null, empty, a known
   *         non-date token, or cannot be parsed by any of the parsers
   */
  public static long getTime(String date) {
    if (date == null) {
      return -1;
    }
    date = date.trim();
    if (date.length() == 0 || badDatePatterns.contains(date)) {
      return -1;
    }

    long time = -1;
    try {
      try {
        if (onlyDigits.matcher(date).matches()) {
          time = Long.parseLong(date);
        }
        Matcher specialTS = specialTSMatcher.matcher(date);
        if (specialTS.matches()) {
          // GregorianCalendar takes the real year and a 0-based month; the
          // deprecated Date(int,...) constructor used before expected
          // year-1900 and therefore produced dates ~1900 years in the future.
          time = new GregorianCalendar(
              Integer.parseInt(specialTS.group(1)),      // year
              Integer.parseInt(specialTS.group(2)) - 1,  // month (0-based)
              Integer.parseInt(specialTS.group(3)),      // day
              Integer.parseInt(specialTS.group(4)),      // hr
              Integer.parseInt(specialTS.group(5)),      // min
              Integer.parseInt(specialTS.group(6)))      // ss
              .getTimeInMillis();
        }
      } catch (NumberFormatException ignored) {
        // e.g. a digit string too long for a long; fall through to the date parsers
      }

      if (time == -1) {
        time = _httpDateParser.get().parse(date).getTime();
      }
    } catch (Exception e) {
      time = DateUtils.parseHttpDate(date);
      if (time == -1) {
        // try to parse it as date in alternative format
        try {
          Date parsedDate = _dateParser.get().parseDate(date);
          time = parsedDate.getTime();
        } catch (Exception e2) {
          LOG.error("can't parse erroneous date: " + date);
        }
      }
    }
    return time;
  }

  /** Self-test entry point: runs the cache-control and content-type checks. */
  @Test
  public void validateParser() throws Exception {
    validateCacheControlParser();
    validateContentTypeParser();
  }

  /**
   * Checks content-type parsing against a table of
   * (raw headers, expected content type, expected charset) triples.
   */
  private void validateContentTypeParser() throws Exception {

    String sampleHeaders[] = {
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html\n",
      "text/html", "",

      // Multiple content-type headers should give us the last one.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html\n" +
      "Content-type: text/html\n",
      "text/html", "",

      "HTTP/1.1 200 OK\n" +
      "Content-type: text/plain\n" +
      "Content-type: text/html\n" +
      "Content-type: text/plain\n" +
      "Content-type: text/html\n",
      "text/html", "",

      // Test charset parsing.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html\n" +
      "Content-type: text/html; charset=ISO-8859-1\n",
      "text/html", "iso-8859-1",

      // Test charset in double quotes.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html\n" +
      "Content-type: text/html; charset=\"ISO-8859-1\"\n",
      "text/html", "iso-8859-1",

      // If there are multiple matching content-type headers, we carry
      // over the charset value.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html;charset=utf-8\n" +
      "Content-type: text/html\n",
      "text/html", "utf-8",

      // Test single quotes.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html;charset='utf-8'\n" +
      "Content-type: text/html\n",
      "text/html", "utf-8",

      // Last charset wins if matching content-type.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html;charset=utf-8\n" +
      "Content-type: text/html;charset=iso-8859-1\n",
      "text/html", "iso-8859-1",

      // Charset is ignored if the content types change.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/plain;charset=utf-8\n" +
      "Content-type: text/html\n",
      "text/html", "",

      // Empty content-type
      "HTTP/1.1 200 OK\n" +
      "Content-type: \n",
      "", "",

      // Empty charset
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html;charset=\n",
      "text/html", "",

      // Multiple charsets, last one wins.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html;charset=utf-8; charset=iso-8859-1\n",
      "text/html", "iso-8859-1",

      // Multiple params.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html; foo=utf-8; charset=iso-8859-1\n",
      "text/html", "iso-8859-1",

      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html ; charset=utf-8 ; bar=iso-8859-1\n",
      "text/html", "utf-8",

      // Comma embedded in quotes.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html ; charset='utf-8,text/plain' ;\n",
      "text/html", "utf-8,text/plain",

      // Charset with leading spaces.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html ; charset= 'utf-8' ;\n",
      "text/html", "utf-8",

      // Media type comments in mime-type.
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html (html)\n",
      "text/html", "",

      // Incomplete charset= param
      "HTTP/1.1 200 OK\n" +
      "Content-type: text/html; char=\n",
      "text/html", "",

      // Invalid media type: no slash
      "HTTP/1.1 200 OK\n" +
      "Content-type: texthtml\n",
      "", "",

      // Invalid media type: */*
      "HTTP/1.1 200 OK\n" +
      "Content-type: */*\n",
      "", ""
    };

    int testCount = sampleHeaders.length / 3;

    for (int i = 0; i < testCount; ++i) {
      String header = sampleHeaders[i * 3];
      String expectedContentType = sampleHeaders[(i * 3) + 1];
      String expectedCharsetType = sampleHeaders[(i * 3) + 2];

      NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(header);

      CrawlURLMetadata metadata = new CrawlURLMetadata();

      System.out.println("****Original Header:" + header);
      System.out.println("Exepcted ContentType:" + expectedContentType);
      System.out.println("Exepcted Charset:" + expectedCharsetType);
      System.out.println("****Parsed Results:");

      parseContentType(headers, metadata);

      if (metadata.isFieldDirty(CrawlURLMetadata.Field_CONTENTTYPE)) {
        System.out.println("ContentType:" + metadata.getContentType());
        Assert.assertTrue(expectedContentType.length() == metadata.getContentType().length());
        if (expectedContentType.length() != 0) {
          Assert.assertTrue(expectedContentType.equals(metadata.getContentType()));
        }
      } else {
        Assert.assertTrue(expectedContentType.length() == 0);
      }

      if (metadata.isFieldDirty(CrawlURLMetadata.Field_CHARSET)) {
        System.out.println("Charset:" + metadata.getCharset());
        Assert.assertTrue(expectedCharsetType.length() == metadata.getCharset().length());
        if (expectedCharsetType.length() != 0) {
          Assert.assertTrue(expectedCharsetType.equals(metadata.getCharset()));
        }
      } else {
        Assert.assertTrue(expectedCharsetType.length() == 0);
      }
    }
  }

  /**
   * Runs {@link #parseHeaders} over a set of sample responses and prints every
   * field that was populated. This is a smoke test: it makes no assertions.
   */
  private void validateCacheControlParser() throws Exception {

    String sampleHeaders[] = {

      "HTTP/1.1 200 OK\n" +
      "Etag: \"34534-d3 134q\"\n" +
      "\n",

      // valid for a little while
      "HTTP/1.1 200 OK\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // expires in the future
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 01:00:00 GMT\n" +
      "\n",

      // expired already
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
      "\n",

      // max-age trumps expires
      "HTTP/1.1 200 OK\n" +
      "HTTP/1.1 200 OK\n" +
      "\n",

      // valid for a little while
      "HTTP/1.1 200 OK\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // expires in the future
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 01:00:00 GMT\n" +
      "\n",

      // expired already
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
      "\n",

      // max-age trumps expires
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // last-modified heuristic: modified a while ago
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
      "\n",

      // last-modified heuristic: modified recently
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "last-modified: Wed, 28 Nov 2007 00:40:10 GMT\n" +
      "\n",

      // cached permanent redirect
      "HTTP/1.1 301 Moved Permanently\n" +
      "\n",

      // cached redirect: not reusable even though by default it would be
      "HTTP/1.1 300 Multiple Choices\n" +
      "Cache-Control: no-cache\n" +
      "\n",

      // cached forever by default
      "HTTP/1.1 410 Gone\n" +
      "\n",

      // cached temporary redirect: not reusable
      "HTTP/1.1 302 Found\n" +
      "\n",

      // cached temporary redirect: reusable
      "HTTP/1.1 302 Found\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // cache-control: max-age=N overrides expires: date in the past
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 00:20:11 GMT\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // cache-control: no-store overrides expires: in the future
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 29 Nov 2007 00:40:11 GMT\n" +
      "cache-control: no-store,private,no-cache=\"foo\"\n" +
      "\n",

      // pragma: no-cache overrides last-modified heuristic
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
      "pragma: no-cache\n" +
      "\n",

      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 00:00:00 GMT\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // last-modified heuristic: modified a while ago
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
      "\n",

      // last-modified heuristic: modified recently
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "last-modified: Wed, 28 Nov 2007 00:40:10 GMT\n" +
      "\n",

      // cached permanent redirect
      "HTTP/1.1 301 Moved Permanently\n" +
      "\n",

      // cached redirect: not reusable even though by default it would be
      "HTTP/1.1 300 Multiple Choices\n" +
      "Cache-Control: no-cache\n" +
      "\n",

      // cached forever by default
      "HTTP/1.1 410 Gone\n" +
      "\n",

      // cached temporary redirect: not reusable
      "HTTP/1.1 302 Found\n" +
      "\n",

      // cached temporary redirect: reusable
      "HTTP/1.1 302 Found\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // cache-control: max-age=N overrides expires: date in the past
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 28 Nov 2007 00:20:11 GMT\n" +
      "cache-control: max-age=10000\n" +
      "\n",

      // cache-control: no-store overrides expires: in the future
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "expires: Wed, 29 Nov 2007 00:40:11 GMT\n" +
      "cache-control: no-store,private,no-cache=\"foo\"\n" +
      "\n",

      // pragma: no-cache overrides last-modified heuristic
      "HTTP/1.1 200 OK\n" +
      "date: Wed, 28 Nov 2007 00:40:11 GMT\n" +
      "last-modified: Wed, 27 Nov 2007 08:00:00 GMT\n" +
      "pragma: no-cache\n" +
      "\n"
    };

    for (String header : sampleHeaders) {

      NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(header);

      CrawlURLMetadata metadata = new CrawlURLMetadata();

      System.out.println("****Original Header:" + header);
      System.out.println("****Parsed Results:");
      try {
        parseHeaders(headers, metadata);

        if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESPONSEFLAGS)) {
          StringBuilder buffer = new StringBuilder();
          buffer.append("ResponseFlags:");
          if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.HEADER_MISSING) != 0) {
            buffer.append(",HeaderMissing");
          }
          if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_MISSING) != 0) {
            buffer.append(",VersionMissing");
          }
          if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_0_9) != 0) {
            buffer.append(",Version0.9");
          }
          if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_1_0) != 0) {
            buffer.append(",Version1.0");
          }
          if ((metadata.getHttpResponseFlags() & CrawlURLMetadata.HTTPResponseFlags.VERSION_1_1) != 0) {
            buffer.append(",Version1.1");
          }
          System.out.println(buffer.toString());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPRESULTCODE)) {
          System.out.println("HttpResultCode:" + metadata.getHttpResultCode());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_ETAG)) {
          System.out.println("ETag:" + metadata.getETag());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_AGE)) {
          System.out.println("Age:" + metadata.getAge());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_HTTPDATE)) {
          System.out.println("Date:" + metadata.getHttpDate());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_LASTMODIFIEDTIME)) {
          System.out.println("Last-Modified:" + metadata.getLastModifiedTime());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_EXPIRES)) {
          System.out.println("Expires:" + metadata.getExpires());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_MAXAGE)) {
          System.out.println("MaxAge:" + metadata.getMaxAge());
        }
        if (metadata.isFieldDirty(CrawlURLMetadata.Field_CACHECONTROLFLAGS)) {
          StringBuilder buffer = new StringBuilder();
          buffer.append("CacheControl:");
          if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.NO_CACHE) != 0) {
            buffer.append("no-cache,");
          }
          if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.NO_STORE) != 0) {
            buffer.append("no-store,");
          }
          if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.MUST_REVALIDATE) != 0) {
            buffer.append("must-revalidate,");
          }
          if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.VARY) != 0) {
            buffer.append("vary,");
          }
          if ((metadata.getCacheControlFlags() & CrawlURLMetadata.CacheControlFlags.PRIVATE) != 0) {
            buffer.append("private,");
          }
          System.out.println(buffer.toString());
        }
      } catch (IOException e) {
        System.out.println(CCStringUtils.stringifyException(e));
      }
    }
  }
}