/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.modules.recrawl; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.HashMap; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.archive.modules.CrawlURI; /** * collection of utility methods useful for loading and storing crawl history. * <p>note that these methods can also be useful for non-HBase crawl history storage.</p> * @contributor kenji * */ public class FetchHistoryHelper { private static final Log logger = LogFactory.getLog(FetchHistoryHelper.class); /** * key for storing timestamp in crawl history map. */ public static final String A_TIMESTAMP = ".ts"; /** * returns a Map to store recrawl data, positioned properly in CrawlURI's * fetch history array, according to {@code timestamp}. this makes it possible * to import crawl history data from multiple sources. * @param uri target {@link CrawlURI} * @param timestamp timestamp (in ms) of crawl history to be added. * @return Map object to store recrawl data, or null if {@code timestamp} is older * than existing crawl history entry and there's no room for it. * @see #setHistoryLength(int) */ @SuppressWarnings("unchecked") public static Map<String, Object> getFetchHistory(CrawlURI uri, long timestamp, int historyLength) { Map<String, Object> data = uri.getData(); Map<String, Object>[] history = (Map[])data.get(RecrawlAttributeConstants.A_FETCH_HISTORY); if (history == null) { // there's no history records at all. // FetchHistoryProcessor assumes history is HashMap[], not Map[]. history = new HashMap[historyLength]; data.put(RecrawlAttributeConstants.A_FETCH_HISTORY, history); } for (int i = 0; i < history.length; i++) { if (history[i] == null) { history[i] = new HashMap<String, Object>(); history[i].put(A_TIMESTAMP, timestamp); return history[i]; } Object ts = history[i].get(A_TIMESTAMP); // no timestamp value is regarded as older than anything. if (!(ts instanceof Long) || timestamp > (Long)ts) { if (i < history.length - 2) { System.arraycopy(history, i, history, i + 1, history.length - i - 1); } else if (i == history.length - 2) { history[i + 1] = history[i]; } history[i] = new HashMap<String, Object>(); history[i].put(A_TIMESTAMP, timestamp); return history[i]; } } return null; } protected static final DateFormat HTTP_DATE_FORMAT = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz"); protected FetchHistoryHelper() { } /** * converts time in HTTP Date format {@code dateStr} to seconds * since epoch. * @param dateStr time in HTTP Date format. * @return seconds since epoch */ public static long parseHttpDate(String dateStr) { synchronized (HTTP_DATE_FORMAT) { try { Date d = HTTP_DATE_FORMAT.parse(dateStr); return d.getTime() / 1000; } catch (ParseException ex) { if (logger.isDebugEnabled()) logger.debug("bad HTTP DATE: " + dateStr); return 0; } } } public static String formatHttpDate(long time) { synchronized (HTTP_DATE_FORMAT) { // format(Date) is not thread safe either return HTTP_DATE_FORMAT.format(new Date(time * 1000)); } } }