/** * Licensed to DigitalPebble Ltd under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * DigitalPebble licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.digitalpebble.stormcrawler.persistence; import java.util.Calendar; import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.digitalpebble.stormcrawler.Constants; import com.digitalpebble.stormcrawler.Metadata; import com.digitalpebble.stormcrawler.util.ConfUtils; /** * Schedules a nextFetchDate based on the configuration **/ public class DefaultScheduler extends Scheduler { /** Date far in the future used for never-refetch items. */ public static final Date NEVER = new Calendar.Builder() .setCalendarType("iso8601").setDate(2099, Calendar.DECEMBER, 31) .build().getTime(); // fetch intervals in minutes private int defaultfetchInterval; private int fetchErrorFetchInterval; private int errorFetchInterval; private CustomInterval[] customIntervals; /* * (non-Javadoc) * * @see * com.digitalpebble.stormcrawler.persistence.Scheduler#init(java.util.Map) */ @SuppressWarnings("rawtypes") @Override public void init(Map stormConf) { defaultfetchInterval = ConfUtils.getInt(stormConf, Constants.defaultFetchIntervalParamName, 1440); fetchErrorFetchInterval = ConfUtils.getInt(stormConf, Constants.fetchErrorFetchIntervalParamName, 120); errorFetchInterval = ConfUtils.getInt(stormConf, Constants.errorFetchIntervalParamName, 44640); // loads any custom key values // must be of form fetchInterval(.STATUS)?.keyname=value // e.g. fetchInterval.isFeed=true // e.g. fetchInterval.FETCH_ERROR.isFeed=true Map<String, CustomInterval> intervals = new HashMap<>(); Pattern pattern = Pattern.compile("^fetchInterval(\\..+)?\\.(.+)=(.+)"); Iterator<String> keyIter = stormConf.keySet().iterator(); while (keyIter.hasNext()) { String key = keyIter.next(); Matcher m = pattern.matcher(key); if (!m.matches()) { continue; } Status status = null; // was a status specified? if (m.group(1) != null) { status = Status.valueOf(m.group(1).substring(1)); } String mdname = m.group(2); String mdvalue = m.group(3); int customInterval = ConfUtils.getInt(stormConf, key, -1); if (customInterval != -1) { CustomInterval interval = intervals.get(mdname + mdvalue); if (interval == null) { interval = new CustomInterval(mdname, mdvalue, status, customInterval); } else { interval.setDurationForStatus(status, customInterval); } // specify particular interval for this status intervals.put(mdname + mdvalue, interval); } } customIntervals = intervals.values().toArray( new CustomInterval[intervals.size()]); } /* * (non-Javadoc) * * @see com.digitalpebble.stormcrawler.persistence.Scheduler#schedule(com. * digitalpebble. stormcrawler.persistence .Status, * com.digitalpebble.stormcrawler.Metadata) */ @Override public Date schedule(Status status, Metadata metadata) { int minutesIncrement = 0; Optional<Integer> customInterval = checkCustomInterval(metadata, status); if (customInterval.isPresent()) { minutesIncrement = customInterval.get(); } else { switch (status) { case FETCHED: minutesIncrement = defaultfetchInterval; break; case FETCH_ERROR: minutesIncrement = fetchErrorFetchInterval; break; case ERROR: minutesIncrement = errorFetchInterval; break; case REDIRECTION: minutesIncrement = defaultfetchInterval; break; default: // leave it to now e.g. DISCOVERED } } // a value of -1 means never fetch // we use a conventional value if (minutesIncrement == -1) { return NEVER; } Calendar cal = Calendar.getInstance(); cal.add(Calendar.MINUTE, minutesIncrement); return cal.getTime(); } /** * Returns the first matching custom interval **/ protected final Optional<Integer> checkCustomInterval(Metadata metadata, Status s) { if (customIntervals == null) return Optional.empty(); for (CustomInterval customInterval : customIntervals) { String[] values = metadata.getValues(customInterval.key); if (values == null) { continue; } for (String v : values) { if (v.equals(customInterval.value)) { return customInterval.getDurationForStatus(s); } } } return Optional.empty(); } private class CustomInterval { private String key; private String value; private Map<Status, Integer> durationPerStatus; private Integer defaultDuration = null; private CustomInterval(String key, String value, Status status, int minutes) { this.key = key; this.value = value; this.durationPerStatus = new HashMap<>(); setDurationForStatus(status, minutes); } private void setDurationForStatus(Status s, int minutes) { if (s == null) { defaultDuration = minutes; } else { this.durationPerStatus.put(s, minutes); } } private Optional<Integer> getDurationForStatus(Status s) { // do we have a specific value for this status? Integer customD = durationPerStatus.get(s); if (customD != null) { return Optional.of(customD); } // is there a default one set? if (defaultDuration != null) { return Optional.of(defaultDuration); } // no default value or custom one for that status return Optional.empty(); } } }