/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutchbase.crawl; import java.util.Set; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.io.Text; import org.apache.nutchbase.util.hbase.RowPart; /** * This interface defines the contract for implementations that manipulate * fetch times and re-fetch intervals. * * @author Andrzej Bialecki */ public interface FetchScheduleHbase extends Configurable { /** * Initialize fetch schedule related data. Implementations should at least * set the <code>fetchTime</code> and <code>fetchInterval</code>. The default * implementation set the <code>fetchTime</code> to now, using the * default <code>fetchInterval</code>. * * @param url URL of the page. * @param row url's row */ public void initializeSchedule(String url, RowPart row); /** * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a * successfully fetched page. * Implementations may use supplied arguments to support different re-fetching * schedules. * * @param url url of the page * @param row url's row * @param prevFetchTime previous value of fetch time, or -1 if not available * @param prevModifiedTime previous value of modifiedTime, or -1 if not available * @param fetchTime the latest time, when the page was recently re-fetched. Most FetchSchedule * implementations should update the value in {@param datum} to something greater than this value. * @param modifiedTime last time the content was modified. This information comes from * the protocol implementations, or is set to < 0 if not available. Most FetchSchedule * implementations should update the value in {@param datum} to this value. * @param state if {@link #STATUS_MODIFIED}, then the content is considered to be "changed" before the * <code>fetchTime</code>, if {@link #STATUS_NOTMODIFIED} then the content is known to be unchanged. * This information may be obtained by comparing page signatures before and after fetching. If this * is set to {@link #STATUS_UNKNOWN}, then it is unknown whether the page was changed; implementations * are free to follow a sensible default behavior. */ public void setFetchSchedule(String url, RowPart row, long prevFetchTime, long prevModifiedTime, long fetchTime, long modifiedTime, int state); /** * This method specifies how to schedule refetching of pages * marked as GONE. Default implementation increases fetchInterval by 50%, * and if it exceeds the <code>maxInterval</code> it calls * {@link #forceRefetch(Text, CrawlDatum, boolean)}. * @param url URL of the page * @param row url's row */ public void setPageGoneSchedule(String url, RowPart row, long prevFetchTime, long prevModifiedTime, long fetchTime); /** * This method adjusts the fetch schedule if fetching needs to be * re-tried due to transient errors. The default implementation * sets the next fetch time 1 day in the future and increases the * retry counter. * @param url URL of the page * @param row url's row * @param prevFetchTime previous fetch time * @param prevModifiedTime previous modified time * @param fetchTime current fetch time */ public void setPageRetrySchedule(String url, RowPart row, long prevFetchTime, long prevModifiedTime, long fetchTime); /** * Calculates last fetch time of the given CrawlDatum. * @return the date as a long. */ public long calculateLastFetchTime(RowPart row); /** * This method provides information whether the page is suitable for * selection in the current fetchlist. NOTE: a true return value does not * guarantee that the page will be fetched, it just allows it to be * included in the further selection process based on scores. The default * implementation checks <code>fetchTime</code>, if it is higher than the * {@param curTime} it returns false, and true otherwise. It will also * check that fetchTime is not too remote (more than <code>maxInterval</code), * in which case it lowers the interval and returns true. * @param url URL of the page * @param row url's row * @param curTime reference time (usually set to the time when the * fetchlist generation process was started). * @return true, if the page should be considered for inclusion in the current * fetchlist, otherwise false. */ public boolean shouldFetch(String url, RowPart row, long curTime); /** * This method resets fetchTime, fetchInterval, modifiedTime and * page signature, so that it forces refetching. * @param url URL of the page * @param row url's row * @param asap if true, force refetch as soon as possible - this sets * the fetchTime to now. If false, force refetch whenever the next fetch * time is set. */ public void forceRefetch(String url, RowPart row, boolean asap); public Set<String> getColumns(); }