/**
 * Copyright 2008 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.util;

import java.text.ParseException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Pseudo flexible HTTP date parser
 *
 * @author rana
 *
 */
public class DateUtils {

  /**
   * Tries a fixed list of {@link SimpleDateFormat} patterns, in order, against
   * a date string and returns the result of the first pattern that consumes
   * the whole string.
   *
   * <p>
   * Instances hold {@link SimpleDateFormat} objects, which are not safe for
   * concurrent use, so a single instance must not be shared between threads
   * without external synchronization.
   */
  public static class DateParser {

    SimpleDateFormat parsers[] = null;

    /**
     * Builds one formatter per pattern, preserving the order of
     * {@code patterns}.
     *
     * @param patterns
     *          {@link SimpleDateFormat} pattern strings, tried in array order
     */
    public DateParser(String[] patterns) {
      parsers = new SimpleDateFormat[patterns.length];
      int index = 0;
      for (String pattern : patterns) {
        parsers[index++] = new SimpleDateFormat(pattern);
      }
    }

    /**
     * Parses {@code str} with the first pattern that matches the entire
     * string.
     *
     * @param str
     *          the text to parse; must not be null
     * @return the parsed date
     * @throws IllegalArgumentException
     *           if {@code str} is null
     * @throws ParseException
     *           if no pattern matches the complete string
     */
    public Date parseDate(String str) throws ParseException {
      if (str == null) {
        throw new IllegalArgumentException("Date and Patterns must not be null");
      }
      for (SimpleDateFormat parser : parsers) {
        // A fresh position per attempt: a failed parse leaves an error index
        // behind, and the index must be inspected BEFORE it is reset --
        // resetting it first made the "consumed everything" check below
        // compare against 0, so no non-empty string could ever succeed.
        ParsePosition pos = new ParsePosition(0);
        Date date = parser.parse(str, pos);
        if (date != null && pos.getIndex() == str.length()) {
          return date;
        }
      }
      throw new ParseException("Unable to parse the date: " + str, -1);
    }
  }

  // Lower-case three letter month prefixes; index + 1 is the month number.
  static String kMonths[] = { "jan", "feb", "mar", "apr", "may", "jun", "jul",
      "aug", "sep", "oct", "nov", "dec" };

  // Every character treated as a token separator. ':' is deliberately absent
  // so that hh:mm:ss survives as one token.
  static String kDelimiters = "\t !\"#$%&'()*+,-./;<=>?@[\\]^_`{|}~";

  /** Broken-down calendar fields collected while scanning a date string. */
  static class TimeExploded {
    int year; // Four digit year "2007"
    int month; // 1-based month (values 1 = January, etc.)
    int day_of_week; // 0-based day of week (0 = Sunday, etc.)
    int day_of_month; // 1-based day of month (1-31)
    int hour; // Hour within the current day (0-23)
    int minute; // Minute within the current hour (0-59)
    int second; // Second within the current minute (0-59 plus leap
                // seconds which may take it up to 60).
    int millisecond; // Milliseconds within the current second (0-999)
  }

  // hh:mm:ss followed by anything (e.g. a time zone glued on: "01:37:06GMT").
  static Pattern timePattern = Pattern.compile("(\\d+):(\\d+):(\\d+).*");

  /** Returns true when {@code c} is one of the characters '0' through '9'. */
  static boolean isASCIIDigit(char c) {
    return c >= '0' && c <= '9';
  }

  /**
   * Parses an HTTP / cookie expiration style date leniently.
   *
   * <p>
   * We try to be lenient, but we need to assume some order to distinguish the
   * fields. The basic rules:
   * <ul>
   * <li>The month name must be present and start with the first 3 letters of
   * the full month name (jan for January, jun for June).</li>
   * <li>If the year is &lt;= 2 digits, it must occur after the day of
   * month.</li>
   * <li>The time must be of the format hh:mm:ss.</li>
   * </ul>
   * An average cookie expiration will look something like this:
   * {@code Sat, 15-Apr-17 21:01:22 GMT}
   *
   * <p>
   * Time zone names in the input are ignored; the fields are always
   * interpreted as GMT. The day of month is only range-checked against 1..31
   * and the calendar is left in its default lenient mode, so a day past the
   * end of the month rolls over into the following month.
   *
   * @param time_string
   *          the date text to parse; null is treated as unparseable
   * @return milliseconds since the epoch for the parsed GMT time, or -1 if a
   *         required field is missing or out of range
   */
  public static long parseHttpDate(String time_string) {
    // Never barf on user input: a missing header value is just "no date".
    if (time_string == null) {
      return -1;
    }

    int kMonthsLen = kMonths.length;
    // We want to be pretty liberal, and support most non-ascii and non-digit
    // characters as a delimiter. We can't treat : as a delimiter, because it
    // is the delimiter for hh:mm:ss, and we want to keep this field together.
    // We make sure to include - and +, since they could prefix numbers.
    // If the cookie attribute came in in quotes (ex expires="XXX"), the quotes
    // will be preserved, and we will get them here. So we make sure to include
    // quote characters, and also \ for anything that was internally escaped.
    TimeExploded exploded = new TimeExploded();

    StringTokenizer tokenizer = new StringTokenizer(time_string, kDelimiters);

    boolean found_day_of_month = false;
    boolean found_month = false;
    boolean found_time = false;
    boolean found_year = false;

    while (tokenizer.hasMoreTokens()) {
      String token = tokenizer.nextToken();
      boolean numerical = isASCIIDigit(token.charAt(0));

      if (!numerical) {
        // String field. Only the first month-like word counts; any later
        // word (typically the time zone name) is ignored, since we don't
        // have much of a reason to support time zones.
        if (!found_month) {
          String tokenLowerCase = token.toLowerCase(Locale.ROOT);
          for (int i = 0; i < kMonthsLen; ++i) {
            // Match prefix, so we could match January, etc
            if (tokenLowerCase.startsWith(kMonths[i])) {
              exploded.month = i + 1;
              found_month = true;
              break;
            }
          }
        }
      } else if (token.indexOf(':') != -1) {
        // Numeric field w/ a colon. We should only ever encounter one
        // time-like thing; if a second one shows up we keep the first.
        if (!found_time) {
          Matcher m = timePattern.matcher(token);
          if (m.matches()) {
            try {
              short hour = Short.parseShort(m.group(1));
              short minute = Short.parseShort(m.group(2));
              short second = Short.parseShort(m.group(3));
              exploded.hour = hour;
              exploded.minute = minute;
              exploded.second = second;
              found_time = true;
            } catch (NumberFormatException ignored) {
              // Component too large for a short: treat the token as noise
              // and keep scanning for a usable time.
            }
          }
        }
      } else {
        // Plain numeric field. A max length is enforced so absurdly long
        // digit runs are skipped rather than parsed.
        if (!found_day_of_month && token.length() <= 2) {
          try {
            exploded.day_of_month = Integer.parseInt(token);
            found_day_of_month = true;
          } catch (NumberFormatException ignored) {
            // Starts with a digit but is not a number (e.g. "1a"): skip it.
          }
        } else if (!found_year && token.length() <= 5) {
          try {
            exploded.year = Integer.parseInt(token);
            found_year = true;
          } catch (NumberFormatException ignored) {
            // Starts with a digit but is not a number: skip it.
          }
        }
        // Otherwise: an extra numeric field, or one that was too long.
        // Ignored on purpose.
      }
    }

    if (!found_day_of_month || !found_month || !found_time || !found_year) {
      // We didn't find all of the fields we need.
      return -1;
    }

    // Normalize the year to expand abbreviated years to the full year.
    if (exploded.year >= 69 && exploded.year <= 99) {
      exploded.year += 1900;
    }
    if (exploded.year >= 0 && exploded.year <= 68) {
      exploded.year += 2000;
    }

    // If our values are within their correct ranges, we got our time.
    // NOTE(review): the 1601..30827 year window is presumably inherited from
    // the parser this code was ported from -- confirm before changing it.
    if (exploded.day_of_month >= 1 && exploded.day_of_month <= 31
        && exploded.month >= 1 && exploded.month <= 12
        && exploded.year >= 1601 && exploded.year <= 30827
        && exploded.hour <= 23 && exploded.minute <= 59
        && exploded.second <= 59) {

      Calendar gmtCalendar = Calendar.getInstance(TimeZone.getTimeZone("GMT"));
      // Calendar months are 0-based, ours are 1-based.
      gmtCalendar.set(exploded.year, exploded.month - 1, exploded.day_of_month,
          exploded.hour, exploded.minute, exploded.second);
      // set(...) above leaves the millisecond field at "now"; zero it so the
      // result is deterministic.
      gmtCalendar.set(Calendar.MILLISECOND, 0);

      return gmtCalendar.getTimeInMillis();
    }

    // One of our values was out of expected range.
    return -1;
  }

  /**
   * Smoke test: verifies that two representative HTTP dates parse, with and
   * without a space before the time zone name.
   *
   * @param args
   *          unused
   * @throws AssertionError
   *           if either sample fails to parse
   */
  public static void main(String[] args) {
    String[] samples = { "Sun, 22 Nov 2009 01:37:06GMT",
        "Sun, 22 Nov 2009 01:37:06 GMT" };
    for (String sample : samples) {
      if (parseHttpDate(sample) == -1) {
        throw new AssertionError("Failed to parse HTTP date: " + sample);
      }
    }
  }
}