/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.crawl;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.io.Text;

/**
 * Contract for components that manipulate fetch times and re-fetch intervals
 * of crawled pages.
 *
 * @author Andrzej Bialecki
 */
public interface FetchSchedule extends Configurable {

  // Members of an interface are implicitly public (and fields implicitly
  // static final), so the modifiers are not repeated on each declaration.

  /** It is unknown whether the page was changed since our last visit. */
  int STATUS_UNKNOWN = 0;

  /** The page is known to have been modified since our last visit. */
  int STATUS_MODIFIED = 1;

  /** The page is known to remain unmodified since our last visit. */
  int STATUS_NOTMODIFIED = 2;

  /** Number of seconds in one day. */
  int SECONDS_PER_DAY = 24 * 60 * 60;

  /**
   * Initializes fetch schedule related data. Implementations should at least
   * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default
   * implementation sets the <code>fetchTime</code> to now, using the default
   * <code>fetchInterval</code>.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance to be initialized.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the supplied
   *         {@link CrawlDatum}, but implementations should make sure that it
   *         contains at least all information from the supplied
   *         {@link CrawlDatum}.
   */
  CrawlDatum initializeSchedule(Text url, CrawlDatum datum);

  /**
   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
   * successfully fetched page. Implementations may use the supplied arguments
   * to support different re-fetching schedules.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          page description to be adjusted. NOTE: this instance, passed by
   *          reference, may be modified inside the method.
   * @param prevFetchTime
   *          previous value of fetch time, or 0 if not available.
   * @param prevModifiedTime
   *          previous value of modifiedTime, or 0 if not available.
   * @param fetchTime
   *          the latest time, when the page was recently re-fetched. Most
   *          FetchSchedule implementations should update the value in the
   *          {@link CrawlDatum} to something greater than this value.
   * @param modifiedTime
   *          last time the content was modified. This information comes from
   *          the protocol implementations, or is set to &lt; 0 if not
   *          available. Most FetchSchedule implementations should update the
   *          value in the {@link CrawlDatum} to this value.
   * @param state
   *          if {@link #STATUS_MODIFIED}, then the content is considered to be
   *          "changed" before the <code>fetchTime</code>; if
   *          {@link #STATUS_NOTMODIFIED}, then the content is known to be
   *          unchanged. This information may be obtained by comparing page
   *          signatures before and after fetching. If this is set to
   *          {@link #STATUS_UNKNOWN}, then it is unknown whether the page was
   *          changed; implementations are free to follow a sensible default
   *          behavior.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the supplied
   *         {@link CrawlDatum}, but implementations should make sure that it
   *         contains at least all information from the supplied
   *         {@link CrawlDatum}.
   */
  CrawlDatum setFetchSchedule(Text url, CrawlDatum datum, long prevFetchTime,
      long prevModifiedTime, long fetchTime, long modifiedTime, int state);

  /**
   * Specifies how to schedule refetching of pages marked as GONE. The default
   * implementation increases <code>fetchInterval</code> by 50%, and if it
   * exceeds the <code>maxInterval</code> it calls
   * {@link #forceRefetch(Text, CrawlDatum, boolean)}.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance to be adjusted.
   * @param prevFetchTime
   *          previous fetch time.
   * @param prevModifiedTime
   *          previous modified time.
   * @param fetchTime
   *          current fetch time.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the supplied
   *         {@link CrawlDatum}, but implementations should make sure that it
   *         contains at least all information from the supplied
   *         {@link CrawlDatum}.
   */
  CrawlDatum setPageGoneSchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime);

  /**
   * Adjusts the fetch schedule if fetching needs to be re-tried due to
   * transient errors. The default implementation sets the next fetch time 1
   * day in the future and increases the retry counter.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          page information.
   * @param prevFetchTime
   *          previous fetch time.
   * @param prevModifiedTime
   *          previous modified time.
   * @param fetchTime
   *          current fetch time.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the supplied
   *         {@link CrawlDatum}, but implementations should make sure that it
   *         contains at least all information from the supplied
   *         {@link CrawlDatum}.
   */
  CrawlDatum setPageRetrySchedule(Text url, CrawlDatum datum,
      long prevFetchTime, long prevModifiedTime, long fetchTime);

  /**
   * Calculates the last fetch time of the given CrawlDatum.
   *
   * @param datum
   *          datum instance whose last fetch time is calculated.
   * @return the date as a long.
   */
  long calculateLastFetchTime(CrawlDatum datum);

  /**
   * Tells whether the page is suitable for selection in the current
   * fetchlist. NOTE: a true return value does not guarantee that the page will
   * be fetched, it just allows it to be included in the further selection
   * process based on scores. The default implementation checks
   * <code>fetchTime</code>: if it is higher than <code>curTime</code> it
   * returns false, and true otherwise. It will also check that
   * <code>fetchTime</code> is not too remote (more than
   * <code>maxInterval</code>), in which case it lowers the interval and
   * returns true.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance.
   * @param curTime
   *          reference time (usually set to the time when the fetchlist
   *          generation process was started).
   * @return true, if the page should be considered for inclusion in the
   *         current fetchlist, otherwise false.
   */
  boolean shouldFetch(Text url, CrawlDatum datum, long curTime);

  /**
   * Resets <code>fetchTime</code>, <code>fetchInterval</code>,
   * <code>modifiedTime</code> and the page signature, so that refetching is
   * forced.
   *
   * @param url
   *          URL of the page.
   * @param datum
   *          datum instance.
   * @param asap
   *          if true, force refetch as soon as possible - this sets the
   *          <code>fetchTime</code> to now. If false, force refetch whenever
   *          the next fetch time is set.
   * @return adjusted page information, including all original information.
   *         NOTE: this may be a different instance than the supplied
   *         {@link CrawlDatum}, but implementations should make sure that it
   *         contains at least all information from the supplied
   *         {@link CrawlDatum}.
   */
  CrawlDatum forceRefetch(Text url, CrawlDatum datum, boolean asap);
}