/**
 * Licensed to DigitalPebble Ltd under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * DigitalPebble licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.digitalpebble.stormcrawler.persistence;

import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.TimeZone;

import org.slf4j.LoggerFactory;

import com.digitalpebble.stormcrawler.Constants;
import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.parse.filter.MD5SignatureParseFilter;
import com.digitalpebble.stormcrawler.persistence.DefaultScheduler;
import com.digitalpebble.stormcrawler.persistence.Status;
import com.digitalpebble.stormcrawler.protocol.HttpHeaders;
import com.digitalpebble.stormcrawler.util.ConfUtils;

/**
 * Adaptive fetch scheduler, checks by signature comparison whether a re-fetched
 * page has changed:
 * <ul>
 * <li>if yes, shrink the fetch interval up to a minimum fetch interval</li>
 * <li>if not, increase the fetch interval up to a maximum</li>
 * </ul>
 *
 * <p>
 * The rate how the fetch interval is incremented or decremented is
 * configurable.
 * </p>
 *
 * <p>
 * Note, that this scheduler requires the following metadata:
 * </p>
 * <dl>
 * <dt>signature</dt>
 * <dd>page signature, filled by {@link MD5SignatureParseFilter}</dd>
 * <dt>signatureOld</dt>
 * <dd>(temporary) copy of the previous signature, optionally copied by
 * {@link MD5SignatureParseFilter}</dd>
 * <dt>fetch.statusCode</dt>
 * <dd>HTTP response status code, required to handle "HTTP 304 Not
 * Modified" responses</dd>
 * </dl>
 * <p>
 * and writes the following metadata fields:
 * </p>
 * <dl>
 * <dt>fetchInterval</dt>
 * <dd>current fetch interval</dd>
 * <dt>signatureChangeDate</dt>
 * <dd>date when the signature has changed</dd>
 * <dt>last-modified</dt>
 * <dd>last-modified time used to send If-Modified-Since HTTP requests, only
 * written if <code>scheduler.adaptive.setLastModified</code> is true. Same date
 * string as set in "signatureChangeDate".</dd>
 * </dl>
 *
 * <h2>Configuration</h2>
 * <p>
 * The following lines show how to configure the adaptive scheduler in the
 * configuration file (crawler-conf.yaml):
 * </p>
 *
 * <pre>
 * scheduler.class: "com.digitalpebble.stormcrawler.persistence.AdaptiveScheduler"
 * # set last-modified time ({@link HttpHeaders#LAST_MODIFIED}) used in HTTP If-Modified-Since request header field
 * scheduler.adaptive.setLastModified: true
 * # min. interval in minutes (default: 1h)
 * scheduler.adaptive.fetchInterval.min: 60
 * # max. interval in minutes (default: 2 weeks)
 * scheduler.adaptive.fetchInterval.max: 20160
 * # increment and decrement rates (0.0 &lt; rate &lt;= 1.0)
 * scheduler.adaptive.fetchInterval.rate.incr: .5
 * scheduler.adaptive.fetchInterval.rate.decr: .5
 *
 * # required persisted metadata (in addition to other persisted metadata):
 * metadata.persist:
 *  - ...
 *  - signature
 *  - fetch.statusCode
 *  - fetchInterval
 *  - last-modified
 * # - signatureOld
 * # - signatureChangeDate
 * # Note: "signatureOld" and "signatureChangeDate" are optional, the adaptive
 * # scheduler will also work if both are temporarily passed and not persisted.
 * </pre>
 *
 * <p>
 * To generate the signature and keep a copy of the last signature, the parse
 * filters should be configured accordingly:
 * </p>
 *
 * <pre>
 * "com.digitalpebble.stormcrawler.parse.ParseFilters": [
 *   ...,
 *   {
 *     "class": "com.digitalpebble.stormcrawler.parse.filter.MD5SignatureParseFilter",
 *     "name": "MD5Digest",
 *     "params": {
 *       "useText": "false",
 *       "keyName": "signature",
 *       "keyNameCopy": "signatureOld"
 *     }
 *   }
 * </pre>
 *
 * <p>
 * The order is mandatory: first copy the old signature, then generate the
 * current one.
 * </p>
 */
public class AdaptiveScheduler extends DefaultScheduler {

    /**
     * Configuration property (boolean) whether or not to set the
     * "last-modified" metadata field when a page change was detected
     * by signature comparison.
     */
    public static final String SET_LAST_MODIFIED = "scheduler.adaptive.setLastModified";

    /**
     * Configuration property (int) to set the minimum fetch interval in
     * minutes.
     */
    public static final String INTERVAL_MIN = "scheduler.adaptive.fetchInterval.min";

    /**
     * Configuration property (int) to set the maximum fetch interval in
     * minutes.
     */
    public static final String INTERVAL_MAX = "scheduler.adaptive.fetchInterval.max";

    /**
     * Configuration property (float) to set the increment rate. If a page
     * hasn't changed when refetched, the fetch interval is multiplied by (1.0 +
     * incr_rate) until the max. fetch interval is reached.
     */
    public static final String INTERVAL_INC_RATE = "scheduler.adaptive.fetchInterval.rate.incr";

    /**
     * Configuration property (float) to set the decrement rate. If a page has
     * changed when refetched, the fetch interval is multiplied by (1.0 -
     * decr_rate). If the fetch interval comes closer to the minimum interval,
     * the decrementing is slowed down.
     */
    public static final String INTERVAL_DEC_RATE = "scheduler.adaptive.fetchInterval.rate.decr";

    /**
     * Name of the signature key in metadata, must be defined as
     * "keyName" in the configuration of
     * {@link com.digitalpebble.stormcrawler.parse.filter.MD5SignatureParseFilter}
     * . This key must be listed in "metadata.persist".
     */
    public static final String SIGNATURE_KEY = "signature";

    /**
     * Name of key to hold previous signature: a copy, not overwritten by
     * {@link MD5SignatureParseFilter}, is added by
     * {@link com.digitalpebble.stormcrawler.parse.filter.SignatureCopyParseFilter}
     * . This key is a temporary copy, not necessarily persisted in metadata.
     */
    public static final String SIGNATURE_OLD_KEY = "signatureOld";

    /**
     * Key to store the current fetch interval value, must be listed in
     * "metadata.persist".
     */
    public static final String FETCH_INTERVAL_KEY = "fetchInterval";

    /**
     * Key to store the date when the signature has been changed, must be listed
     * in "metadata.persist".
     */
    public static final String SIGNATURE_MODIFIED_KEY = "signatureChangeDate";

    private static final org.slf4j.Logger LOG = LoggerFactory
            .getLogger(AdaptiveScheduler.class);

    /**
     * Fetch interval in minutes used when neither a persisted nor a custom
     * interval is available; read from the configuration in
     * {@link #init(Map)}.
     */
    protected int defaultfetchInterval;

    /** Lower bound of the fetch interval in minutes (default: 1 hour). */
    protected int minFetchInterval = 60;

    /** Upper bound of the fetch interval in minutes (default: 2 weeks). */
    protected int maxFetchInterval = 60 * 24 * 14;

    /** Rate applied when shrinking the interval after a detected change. */
    protected float fetchIntervalDecRate = .5f;

    /** Rate applied when growing the interval of an unchanged page. */
    protected float fetchIntervalIncRate = .5f;

    /** Whether to write the "last-modified" metadata field on a change. */
    protected boolean setLastModified = false;

    /** Not read anywhere in this class; kept for subclasses. */
    protected boolean overwriteLastModified = false;

    /**
     * Format dates in HTTP headers, cf. <a href=
     * "https://www.w3.org/Protocols/rfc2616/rfc2616-sec3.html#sec3.3">sec3.3 in
     * RFC 2616</a>. Used to fill the last-modified metadata field. The
     * formatter is pinned to GMT because HTTP dates must be expressed in GMT,
     * independent of the time zone of the JVM.
     */
    protected SimpleDateFormat httpDateFormat = newHttpDateFormat();

    /** Creates the HTTP date formatter (US locale, GMT time zone). */
    private static SimpleDateFormat newHttpDateFormat() {
        SimpleDateFormat format = new SimpleDateFormat(
                "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
        format.setTimeZone(TimeZone.getTimeZone("GMT"));
        return format;
    }

    /**
     * Reads the adaptive scheduler settings from the configuration, keeping
     * the built-in defaults for every property which is not set, and then
     * initializes the parent scheduler.
     *
     * @param stormConf
     *            the topology configuration
     */
    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public void init(Map stormConf) {
        defaultfetchInterval = ConfUtils.getInt(stormConf,
                Constants.defaultFetchIntervalParamName, 1440);
        setLastModified = ConfUtils.getBoolean(stormConf, SET_LAST_MODIFIED,
                false);
        minFetchInterval = ConfUtils.getInt(stormConf, INTERVAL_MIN,
                minFetchInterval);
        maxFetchInterval = ConfUtils.getInt(stormConf, INTERVAL_MAX,
                maxFetchInterval);
        fetchIntervalDecRate = ConfUtils.getFloat(stormConf,
                INTERVAL_DEC_RATE, fetchIntervalDecRate);
        fetchIntervalIncRate = ConfUtils.getFloat(stormConf,
                INTERVAL_INC_RATE, fetchIntervalIncRate);
        super.init(stormConf);
    }

    /**
     * Computes the next fetch date of a page and updates the scheduling
     * metadata ("fetchInterval", "signatureChangeDate" and optionally
     * "last-modified").
     *
     * <p>
     * For any status other than {@link Status#FETCHED}, and for fetched pages
     * lacking one of the two signatures, scheduling is delegated to
     * {@link DefaultScheduler}.
     * </p>
     *
     * @param status
     *            status of the URL
     * @param metadata
     *            metadata of the URL, modified in place
     * @return the date of the next fetch
     */
    @Override
    public Date schedule(Status status, Metadata metadata) {
        LOG.debug("Scheduling status: {}, metadata: {}", status, metadata);

        // read the signatures before they are possibly removed below
        String signature = metadata.getFirstValue(SIGNATURE_KEY);
        String oldSignature = metadata.getFirstValue(SIGNATURE_OLD_KEY);

        if (status != Status.FETCHED) {
            // reset all metadata
            metadata.remove(SIGNATURE_MODIFIED_KEY);
            metadata.remove(FETCH_INTERVAL_KEY);
            metadata.remove(SIGNATURE_KEY);
            metadata.remove(SIGNATURE_OLD_KEY);
            // fall-back to DefaultScheduler
            return super.schedule(status, metadata);
        }

        Calendar now = Calendar.getInstance(Locale.ROOT);

        String signatureModified = metadata
                .getFirstValue(SIGNATURE_MODIFIED_KEY);

        boolean changed = false;

        final String modifiedTimeString = httpDateFormat.format(now.getTime());

        // constant-first comparison: the status code may be missing from the
        // metadata, which must not cause a NullPointerException
        if ("304".equals(metadata.getFirstValue("fetch.statusCode"))) {
            // HTTP 304 Not Modified
            // - no new signature calculated because no content fetched
            // - do not compare persisted signatures
        } else if (signature == null || oldSignature == null) {
            // no decision possible by signature comparison if
            // - document not parsed (intentionally or not) or
            // - signature not generated or
            // - old signature not copied
            // fall-back to DefaultScheduler
            LOG.debug("No signature for FETCHED page: {}", metadata);
            return super.schedule(status, metadata);
        } else if (signature.equals(oldSignature)) {
            // unchanged
        } else {
            // change detected by signature comparison
            changed = true;
            signatureModified = modifiedTimeString;
            if (setLastModified) {
                metadata.setValue(HttpHeaders.LAST_MODIFIED,
                        modifiedTimeString);
            }
        }

        final int previousInterval = currentInterval(status, metadata);
        int interval = previousInterval;

        if (changed) {
            // shrink fetch interval (slow down decrementing if already close to
            // the minimum interval)
            interval = (int) ((1.0f - fetchIntervalDecRate) * interval
                    + fetchIntervalDecRate * minFetchInterval);
            LOG.debug(
                    "Signature has changed, fetchInterval decreased from {} to {}",
                    previousInterval, interval);
        } else {
            // no change or not modified, increase fetch interval
            interval = (int) (interval * (1.0f + fetchIntervalIncRate));
            if (interval > maxFetchInterval) {
                interval = maxFetchInterval;
            }
            LOG.debug("Unchanged, fetchInterval increased from {} to {}",
                    previousInterval, interval);
            // remove old signature (do not keep same signature twice)
            metadata.remove(SIGNATURE_OLD_KEY);
            if (signatureModified == null) {
                signatureModified = modifiedTimeString;
            }
        }

        metadata.setValue(FETCH_INTERVAL_KEY, Integer.toString(interval));
        metadata.setValue(SIGNATURE_MODIFIED_KEY, signatureModified);

        now.add(Calendar.MINUTE, interval);

        return now.getTime();
    }

    /**
     * Determines the fetch interval (in minutes) to start from: the persisted
     * "fetchInterval" if present and numeric, otherwise the custom interval
     * provided by {@link DefaultScheduler} or the default fetch interval.
     */
    private int currentInterval(Status status, Metadata metadata) {
        String persisted = metadata.getFirstValue(FETCH_INTERVAL_KEY);
        if (persisted != null) {
            try {
                return Integer.parseInt(persisted);
            } catch (NumberFormatException e) {
                // a corrupted value must not abort scheduling of the page
                LOG.warn("Invalid value for {}: {}, using default interval",
                        FETCH_INTERVAL_KEY, persisted);
            }
        }
        // initialize from DefaultScheduler
        Optional<Integer> customInterval = super.checkCustomInterval(metadata,
                status);
        return customInterval.orElse(defaultfetchInterval);
    }
}