/* $Id: ThrottleDescription.java 988245 2010-08-23 18:39:35Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.connectorcommon.interfaces.*;
import java.util.*;
import java.util.regex.*;

/** This class describes complex throttling criteria pulled from a configuration.
* The data contained is organized by regular expression performed on a bin.  What we store
* for each regular expression is a Pattern, for efficiency.
*
* This structure deals with bandwidth limits, maximum connection limits, and maximum fetch rate
* limits.  Average fetch rate limits are handled in the infrastructure.
*
* Generally it is a good thing to limit the number of regexps that need to be evaluated against
* any given bin value as much as possible.  For that reason I've organized this structure
* accordingly.
*/
public class ThrottleDescription implements IThrottleSpec
{
  public static final String _rcsid = "@(#)$Id: ThrottleDescription.java 988245 2010-08-23 18:39:35Z kwright $";

  /** This is the hash that contains everything.  It's keyed by the regexp string itself.
  * Values are ThrottleItem's.  Because the key is the regexp string alone, a later bin
  * description with the same regexp replaces an earlier one.
  */
  protected Map<String,ThrottleItem> patternHash = new HashMap<String,ThrottleItem>();

  /** Constructor.  Build the description from the ConfigParams.
  * Every child node of type NODE_BINDESC contributes one throttle item; other node types
  * are ignored.
  *@param configData is the configuration to scan for bin description nodes.
  *@throws ManifoldCFException if a limit value is not a legal number, if a bin description
  * has no regular expression attribute, or if the regular expression does not compile.
  */
  public ThrottleDescription(ConfigParams configData)
    throws ManifoldCFException
  {
    // Scan, looking for bin description nodes
    for (int i = 0; i < configData.getChildCount(); i++)
    {
      ConfigNode node = configData.getChild(i);
      if (!node.getType().equals(WebcrawlerConfig.NODE_BINDESC))
        continue;

      // Get the bin regexp
      String binDescription = node.getAttributeValue(WebcrawlerConfig.ATTR_BINREGEXP);
      // Get the case insensitivity flag; anything other than "true" means case sensitive
      String insensitiveValue = node.getAttributeValue(WebcrawlerConfig.ATTR_INSENSITIVE);
      boolean isInsensitive = insensitiveValue != null && insensitiveValue.equalsIgnoreCase("true");

      // Now, go through this node's children looking for values we know about.
      // A null value means "no limit of this kind for this bin".
      Integer maxConnectionCount = null;
      Double minMillisecondsPerByte = null;
      Long minMillisecondsPerFetch = null;
      for (int j = 0; j < node.getChildCount(); j++)
      {
        ConfigNode childNode = node.getChild(j);
        String childType = childNode.getType();
        if (childType.equals(WebcrawlerConfig.NODE_MAXCONNECTIONS))
        {
          String value = childNode.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
          if (value != null)
          {
            try
            {
              maxConnectionCount = Integer.valueOf(value);
            }
            catch (NumberFormatException e)
            {
              throw new ManifoldCFException("Bad number",e);
            }
          }
        }
        else if (childType.equals(WebcrawlerConfig.NODE_MAXKBPERSECOND))
        {
          String value = childNode.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
          if (value != null)
          {
            try
            {
              double kbPerSecond = Double.parseDouble(value);
              // The delay per byte is the reciprocal of the rate; a non-positive rate
              // leaves the bandwidth unlimited.
              if (kbPerSecond > 0)
                minMillisecondsPerByte = Double.valueOf(1.0/kbPerSecond);
            }
            catch (NumberFormatException e)
            {
              throw new ManifoldCFException("Bad number",e);
            }
          }
        }
        else if (childType.equals(WebcrawlerConfig.NODE_MAXFETCHESPERMINUTE))
        {
          String value = childNode.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
          if (value != null)
          {
            try
            {
              double fetchesPerMinute = Double.parseDouble(value);
              // 60000 milliseconds per minute, truncated to whole milliseconds; a
              // non-positive rate leaves the fetch rate unlimited.
              if (fetchesPerMinute > 0)
                minMillisecondsPerFetch = Long.valueOf((long)(60000.0/fetchesPerMinute));
            }
            catch (NumberFormatException e)
            {
              throw new ManifoldCFException("Bad number: "+e.getMessage(),e);
            }
          }
        }
      }

      // Without this check a missing attribute would surface as a bare NullPointerException
      // from Pattern.compile().
      if (binDescription == null)
        throw new ManifoldCFException("Bin description is missing its regular expression");

      int flags = Pattern.UNICODE_CASE;
      if (isInsensitive)
        flags |= Pattern.CASE_INSENSITIVE;
      Pattern p;
      try
      {
        p = Pattern.compile(binDescription,flags);
      }
      catch (PatternSyntaxException e)
      {
        throw new ManifoldCFException("Bin regular expression '"+binDescription+"' is illegal: "+e.getMessage(),e);
      }

      ThrottleItem ti = new ThrottleItem(p);
      ti.setMaxOpenConnections(maxConnectionCount);
      ti.setMinimumMillisecondsPerByte(minMillisecondsPerByte);
      ti.setMinimumMillisecondsPerFetch(minMillisecondsPerFetch);
      patternHash.put(binDescription,ti);
    }
  }

  /** Given a bin name, find the max open connections to use for that bin.
  * When several patterns match, the largest configured limit wins.  A resulting limit
  * of zero is raised to one.
  *@param binName is the bin name to match against the configured regular expressions.
  *@return Integer.MAX_VALUE if no limit found.
  */
  @Override
  public int getMaxOpenConnections(String binName)
  {
    // Go through the regexps and match; for each match, find the maximum possible.
    // NOTE(review): -1 doubles as the "nothing matched yet" marker, so a configured limit of
    // exactly -1 is reported as Integer.MAX_VALUE and other negative limits are returned
    // unchanged.  Confirm whether negative values can reach this class.
    int maxCount = -1;
    for (ThrottleItem ti : patternHash.values())
    {
      Integer limit = ti.getMaxOpenConnections();
      if (limit == null || !matchesBin(ti,binName))
        continue;
      if (maxCount == -1 || limit.intValue() > maxCount)
        maxCount = limit.intValue();
    }
    if (maxCount == -1)
      return Integer.MAX_VALUE;
    if (maxCount == 0)
      return 1;
    return maxCount;
  }

  /** Look up minimum milliseconds per byte for a bin.
  * When several patterns match, the smallest configured delay wins.
  *@param binName is the bin name to match against the configured regular expressions.
  *@return 0.0 if no limit found.
  */
  @Override
  public double getMinimumMillisecondsPerByte(String binName)
  {
    // Go through the regexps and match; for each match, keep the smallest delay seen.
    double minMilliseconds = 0.0;
    boolean seenSomething = false;
    for (ThrottleItem ti : patternHash.values())
    {
      Double limit = ti.getMinimumMillisecondsPerByte();
      if (limit == null || !matchesBin(ti,binName))
        continue;
      if (!seenSomething || limit.doubleValue() < minMilliseconds)
      {
        seenSomething = true;
        minMilliseconds = limit.doubleValue();
      }
    }
    return minMilliseconds;
  }

  /** Look up minimum milliseconds for a fetch for a bin.
  * When several patterns match, the smallest configured delay wins.
  *@param binName is the bin name to match against the configured regular expressions.
  *@return 0 if no limit found.
  */
  @Override
  public long getMinimumMillisecondsPerFetch(String binName)
  {
    // Go through the regexps and match; for each match, keep the smallest delay seen.
    long minMilliseconds = 0L;
    boolean seenSomething = false;
    for (ThrottleItem ti : patternHash.values())
    {
      Long limit = ti.getMinimumMillisecondsPerFetch();
      if (limit == null || !matchesBin(ti,binName))
        continue;
      if (!seenSomething || limit.longValue() < minMilliseconds)
      {
        seenSomething = true;
        minMilliseconds = limit.longValue();
      }
    }
    return minMilliseconds;
  }

  /** Check whether a throttle item's pattern is found anywhere within a bin name.
  *@param ti is the throttle item whose pattern should be applied.
  *@param binName is the bin name to search.
  *@return true if the pattern occurs in the bin name.
  */
  private static boolean matchesBin(ThrottleItem ti, String binName)
  {
    return ti.getPattern().matcher(binName).find();
  }

  /** Class representing an individual throttle item.
  */
  protected static class ThrottleItem
  {
    /** The bin-matching pattern. */
    protected final Pattern pattern;
    /** The minimum milliseconds between bytes, or null if no limit. */
    protected Double minimumMillisecondsPerByte = null;
    /** The minimum milliseconds per fetch, or null if no limit */
    protected Long minimumMillisecondsPerFetch = null;
    /** The maximum open connections, or null if no limit. */
    protected Integer maxOpenConnections = null;

    /** Constructor.
    *@param p is the compiled bin-matching pattern.
    */
    public ThrottleItem(Pattern p)
    {
      pattern = p;
    }

    /** Get the pattern.
    *@return the compiled bin-matching pattern.
    */
    public Pattern getPattern()
    {
      return pattern;
    }

    /** Set minimum milliseconds per byte.
    *@param value is the new limit, or null for no limit.
    */
    public void setMinimumMillisecondsPerByte(Double value)
    {
      minimumMillisecondsPerByte = value;
    }

    /** Get minimum milliseconds per byte.
    *@return the limit, or null if there is none.
    */
    public Double getMinimumMillisecondsPerByte()
    {
      return minimumMillisecondsPerByte;
    }

    /** Set minimum milliseconds per fetch
    *@param value is the new limit, or null for no limit.
    */
    public void setMinimumMillisecondsPerFetch(Long value)
    {
      minimumMillisecondsPerFetch = value;
    }

    /** Get minimum milliseconds per fetch
    *@return the limit, or null if there is none.
    */
    public Long getMinimumMillisecondsPerFetch()
    {
      return minimumMillisecondsPerFetch;
    }

    /** Set maximum open connections.
    *@param value is the new limit, or null for no limit.
    */
    public void setMaxOpenConnections(Integer value)
    {
      maxOpenConnections = value;
    }

    /** Get maximum open connections.
    *@return the limit, or null if there is none.
    */
    public Integer getMaxOpenConnections()
    {
      return maxOpenConnections;
    }
  }

}