/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutchbase.crawl;

import java.util.HashSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.crawl.FetchSchedule;
import org.apache.nutchbase.util.hbase.RowPart;
import org.apache.nutchbase.util.hbase.TableColumns;

/**
 * This class provides common methods for implementations of
 * {@link FetchSchedule}.
 *
 * @author Andrzej Bialecki
 */
public abstract class AbstractFetchScheduleHbase
extends Configured
implements FetchScheduleHbase {

  private static final Log LOG =
      LogFactory.getLog(AbstractFetchScheduleHbase.class);

  protected int defaultInterval;
  protected int maxInterval;

  private static final Set<String> COLUMNS = new HashSet<String>();

  static {
    COLUMNS.add(TableColumns.FETCH_TIME_STR);
    COLUMNS.add(TableColumns.RETRIES_STR);
    COLUMNS.add(TableColumns.FETCH_INTERVAL_STR);
  }

  public AbstractFetchScheduleHbase() {
    super(null);
  }

  public AbstractFetchScheduleHbase(Configuration conf) {
    super(conf);
  }

  public void setConf(Configuration conf) {
    super.setConf(conf);
    if (conf == null) return;
    int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);
    defaultInterval = conf.getInt("db.fetch.interval.default", 0);
    if (oldDefaultInterval > 0 && defaultInterval == 0) {
      defaultInterval = oldDefaultInterval * FetchSchedule.SECONDS_PER_DAY;
    }
    int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);
    maxInterval = conf.getInt("db.fetch.interval.max", 0);
    if (oldMaxInterval > 0 && maxInterval == 0) {
      maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
    }
    LOG.info("defaultInterval=" + defaultInterval);
    LOG.info("maxInterval=" + maxInterval);
  }

  /**
   * Initialize fetch schedule related data. Implementations should at least
   * set the <code>fetchTime</code> and <code>fetchInterval</code>. The
   * default implementation sets the <code>fetchTime</code> to now, using the
   * default <code>fetchInterval</code>.
   *
   * @param url URL of the page
   * @param row url's row
   */
  public void initializeSchedule(String url, RowPart row) {
    row.setFetchTime(System.currentTimeMillis());
    row.setFetchInterval(defaultInterval);
    row.setRetriesSinceFetch(0);
  }

  /**
   * Sets the <code>fetchInterval</code> and <code>fetchTime</code> on a
   * successfully fetched page. NOTE: this implementation resets the retry
   * counter - extending classes should call super.setFetchSchedule() to
   * preserve this behavior.
   */
  public void setFetchSchedule(String url, RowPart row,
      long prevFetchTime, long prevModifiedTime,
      long fetchTime, long modifiedTime, int state) {
    row.setRetriesSinceFetch(0);
  }
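  /*
   * Illustrative sketch, not part of this class: a minimal subclass that
   * adapts the interval on each successful fetch while calling
   * super.setFetchSchedule() to preserve the retry-counter reset described
   * above. The class name and the 0.5f/1.5f factors are hypothetical; the
   * RowPart accessors are the ones used elsewhere in this class.
   *
   *   public class ExampleAdaptiveScheduleHbase
   *   extends AbstractFetchScheduleHbase {
   *     public void setFetchSchedule(String url, RowPart row,
   *         long prevFetchTime, long prevModifiedTime,
   *         long fetchTime, long modifiedTime, int state) {
   *       super.setFetchSchedule(url, row, prevFetchTime, prevModifiedTime,
   *           fetchTime, modifiedTime, state); // resets retriesSinceFetch
   *       // shrink the interval when the page changed, grow it when it
   *       // did not, and keep it within maxInterval
   *       float factor = (modifiedTime > prevModifiedTime) ? 0.5f : 1.5f;
   *       int interval = Math.min(
   *           Math.round(row.getFetchInterval() * factor), maxInterval);
   *       row.setFetchInterval(interval);
   *       row.setFetchTime(fetchTime + interval * 1000L);
   *     }
   *   }
   */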
  /**
   * This method specifies how to schedule refetching of pages marked as
   * GONE. Default implementation increases fetchInterval by 50%, and if it
   * exceeds the <code>maxInterval</code> it calls
   * {@link #forceRefetch(String, RowPart, boolean)}.
   *
   * @param url URL of the page
   * @param row url's row
   */
  public void setPageGoneSchedule(String url, RowPart row,
      long prevFetchTime, long prevModifiedTime, long fetchTime) {
    // no page is truly GONE ... just increase the interval by 50%
    // and try much later.
    int newFetchInterval = (int) (row.getFetchInterval() * 1.5f);
    row.setFetchInterval(newFetchInterval);
    row.setFetchTime(fetchTime + newFetchInterval * 1000L);
    if (maxInterval < newFetchInterval) forceRefetch(url, row, false);
  }

  /**
   * This method adjusts the fetch schedule if fetching needs to be re-tried
   * due to transient errors. The default implementation sets the next fetch
   * time 1 day in the future and increases the retry counter.
   *
   * @param url URL of the page
   * @param row url's row
   * @param prevFetchTime previous fetch time
   * @param prevModifiedTime previous modified time
   * @param fetchTime current fetch time
   */
  public void setPageRetrySchedule(String url, RowPart row,
      long prevFetchTime, long prevModifiedTime, long fetchTime) {
    // fetchTime is in milliseconds, so the one-day delay must be converted
    // from seconds
    row.setFetchTime(fetchTime + FetchSchedule.SECONDS_PER_DAY * 1000L);
    int oldRetries = row.getRetriesSinceFetch();
    row.setRetriesSinceFetch(oldRetries + 1);
  }

  /**
   * This method returns the last fetch time of the page, computed by
   * subtracting the fetch interval from the scheduled fetch time.
   *
   * @return the date as a long
   */
  public long calculateLastFetchTime(RowPart row) {
    return row.getFetchTime() - row.getFetchInterval() * 1000L;
  }

  /**
   * This method provides information whether the page is suitable for
   * selection in the current fetchlist. NOTE: a true return value does not
   * guarantee that the page will be fetched, it just allows it to be
   * included in the further selection process based on scores. The default
   * implementation checks <code>fetchTime</code>: if it is higher than
   * <code>curTime</code> it returns false, and true otherwise. It will also
   * check that fetchTime is not too remote (more than
   * <code>maxInterval</code>), in which case it lowers the interval and
   * returns true.
   *
   * @param url URL of the page
   * @param row url's row
   * @param curTime reference time (usually set to the time when the
   * fetchlist generation process was started)
   * @return true, if the page should be considered for inclusion in the
   * current fetchlist, otherwise false
   */
  public boolean shouldFetch(String url, RowPart row, long curTime) {
    // pages are never truly GONE - we have to check them from time to time.
    // pages with too long fetchInterval are adjusted so that they fit within
    // maximum fetchInterval (segment retention period).
    long fetchTime = row.getFetchTime();
    if (fetchTime - curTime > maxInterval * 1000L) {
      row.setFetchInterval(Math.round(maxInterval * 0.9f));
      row.setFetchTime(curTime);
      // keep the local copy in sync so the adjusted page is eligible now
      fetchTime = curTime;
    }
    if (fetchTime > curTime) {
      return false; // not time yet
    }
    return true;
  }
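  /*
   * Worked example of the scheduling arithmetic above; the numbers and the
   * local names (schedule, row, url, now) are hypothetical, the calls are
   * this class's API.
   *
   *   // fetchInterval = 30 days = 2,592,000 s
   *   row.setFetchInterval(30 * FetchSchedule.SECONDS_PER_DAY);
   *   schedule.setPageGoneSchedule(url, row, 0L, 0L, now);
   *   // fetchInterval is now 1.5 * 2,592,000 = 3,888,000 s (45 days) and
   *   // fetchTime = now + 45 days, so assuming maxInterval is at least
   *   // 45 days, shouldFetch(url, row, now) returns false until those
   *   // 45 days have passed. Repeated GONE rounds keep multiplying by 1.5
   *   // until the interval exceeds maxInterval, at which point
   *   // forceRefetch() caps it at 0.9 * maxInterval.
   */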
  /**
   * This method resets fetchTime, fetchInterval, modifiedTime,
   * retriesSinceFetch and page signature, so that it forces refetching.
   *
   * @param url URL of the page
   * @param row url's row
   * @param asap if true, force refetch as soon as possible - this sets
   * the fetchTime to now. If false, force refetch whenever the next fetch
   * time is set.
   */
  public void forceRefetch(String url, RowPart row, boolean asap) {
    // reduce fetchInterval so that it fits within the max value
    if (row.getFetchInterval() > maxInterval)
      row.setFetchInterval(Math.round(maxInterval * 0.9f));
    row.setStatus(CrawlDatumHbase.STATUS_UNFETCHED);
    row.setRetriesSinceFetch(0);
    // TODO: row.setSignature(null) ??
    row.setModifiedTime(0L);
    if (asap) row.setFetchTime(System.currentTimeMillis());
  }

  public Set<String> getColumns() {
    return COLUMNS;
  }
}
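/*
 * Configuration sketch for setConf() above; the values are hypothetical,
 * and NutchConfiguration is the usual Nutch factory for Hadoop
 * Configuration objects.
 *
 *   Configuration conf = NutchConfiguration.create();
 *   // current, second-based properties:
 *   conf.setInt("db.fetch.interval.default",
 *       30 * FetchSchedule.SECONDS_PER_DAY);
 *   conf.setInt("db.fetch.interval.max",
 *       90 * FetchSchedule.SECONDS_PER_DAY);
 *   // the legacy, day-based properties ("db.default.fetch.interval",
 *   // "db.max.fetch.interval") are only consulted when the second-based
 *   // ones are unset, and are multiplied by SECONDS_PER_DAY
 *   schedule.setConf(conf);
 */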