/* $Id: WebcrawlerConfig.java 995042 2010-09-08 13:10:06Z kwright $ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;

/**
* Constants for the Webcrawler connector configuration.
*
* <p>This class only declares string constants: parameter names, node names,
* attribute names and attribute values. They fall into two groups: the
* connection configuration (the "how" of fetching) and the document
* specification (the "what" of a job).
*/
public class WebcrawlerConfig
{
  public static final String _rcsid = "@(#)$Id: WebcrawlerConfig.java 995042 2010-09-08 13:10:06Z kwright $";

  // Constants describing the configuration structure.  This structure describes the "how" of
  // fetching page data - e.g. bandwidth and fetch constraints, adherence to robots conventions,
  // etc.
  // For the throttling part of the connector, the scheduler handles average fetch-rate limits.
  // The per-connection configuration describes the maximum number of connections per some
  // user-defined criteria, as well as bandwidth maximums and fetch rate absolute maximums.
  //
  // In detail:
  //
  // 1) The robots conventions;
  // 2) Bandwidth limits in KB/sec, based on regular expressions done on the bins;
  // 3) Email address (so people can complain to somebody about our crawler);
  // 4) Maximum number of connections per host, based on a regular expression done on the bins
  //    a document belongs to.
  // 5) Authentication information (basic, NTLM, or session-based), based on regexp of a
  //    document's URL.
  // 6) SSL trust store certificates, trusted on the basis of a regexp of a document's URL.

  /** Robots usage (a parameter) */
  public static final String PARAMETER_ROBOTSUSAGE = "Robots usage";
  /** Meta robots tags usage (a parameter) */
  public static final String PARAMETER_META_ROBOTS_TAGS_USAGE = "Meta robots tags usage";
  /** Email address (a parameter) */
  public static final String PARAMETER_EMAIL = "Email address";
  /** Proxy host name (a parameter) */
  public static final String PARAMETER_PROXYHOST = "Proxy host";
  /** Proxy port (a parameter) */
  public static final String PARAMETER_PROXYPORT = "Proxy port";
  /** Proxy authentication domain (a parameter) */
  public static final String PARAMETER_PROXYAUTHDOMAIN = "Proxy authentication domain";
  /** Proxy authentication user name (a parameter) */
  public static final String PARAMETER_PROXYAUTHUSERNAME = "Proxy authentication user name";
  /** Proxy authentication password (a parameter) */
  public static final String PARAMETER_PROXYAUTHPASSWORD = "Proxy authentication password";

  /** The bin description node */
  public static final String NODE_BINDESC = "bindesc";
  /** The bin regular expression attribute */
  public static final String ATTR_BINREGEXP = "binregexp";
  /** Attribute stating whether the match is case insensitive */
  public static final String ATTR_INSENSITIVE = "caseinsensitive";
  /** The max connections node */
  public static final String NODE_MAXCONNECTIONS = "maxconnections";
  /** The bandwidth node */
  public static final String NODE_MAXKBPERSECOND = "maxkbpersecond";
  /** The max fetch rate node */
  public static final String NODE_MAXFETCHESPERMINUTE = "maxfetchesperminute";
  /** The value attribute (used for maxconnections and maxkbpersecond) */
  public static final String ATTR_VALUE = "value";

  /** Access control description node */
  public static final String NODE_ACCESSCREDENTIAL = "accesscredential";
  /** Regexp attribute for the access control node */
  public static final String ATTR_URLREGEXP = "urlregexp";
  /** Type of security */
  public static final String ATTR_TYPE = "type";
  /** Type value for basic authentication */
  public static final String ATTRVALUE_BASIC = "basic";
  /** Type value for NTLM authentication */
  public static final String ATTRVALUE_NTLM = "ntlm";
  /** Type value for session-based authentication */
  public static final String ATTRVALUE_SESSION = "session";
  /** Domain/realm part of credentials (if any) */
  public static final String ATTR_DOMAIN = "domain";
  /** Username part of credentials */
  public static final String ATTR_USERNAME = "username";
  /** Password part of credentials */
  public static final String ATTR_PASSWORD = "password";

  /** Authentication page description node */
  public static final String NODE_AUTHPAGE = "authpage";
  /** Authentication page type: Form */
  public static final String ATTRVALUE_FORM = "form";
  /** Authentication page type: Link */
  public static final String ATTRVALUE_LINK = "link";
  /** Authentication page type: Redirection */
  public static final String ATTRVALUE_REDIRECTION = "redirection";
  /** Authentication page type: Content */
  public static final String ATTRVALUE_CONTENT = "content";
  /** Form name or link target regexp for authentication page */
  public static final String ATTR_MATCHREGEXP = "match";
  /** URL to fetch next in a sequence (an override) */
  public static final String ATTR_OVERRIDETARGETURL = "overridetargeturl";
  /** Authentication parameter node */
  public static final String NODE_AUTHPARAMETER = "authparameter";
  /** Authentication parameter name regexp */
  public static final String ATTR_NAMEREGEXP = "name";

  /** Trust store description node */
  public static final String NODE_TRUST = "trust";
  /** Trust store section of authentication record */
  public static final String ATTR_TRUSTSTORE = "truststore";
  /** "Trust everything" attribute - replacing truststore if set to 'true' */
  public static final String ATTR_TRUSTEVERYTHING = "trusteverything";

  // Constants used in the document specification part of the configuration structure.
  // This describes the "what" of the job.

  /** Map entry specification node.  Has two attributes: 'match' and 'map'. */
  public static final String NODE_MAP = "map";
  /** The seeds node.  The value of this node contains the seeds, as a large
  * text area. */
  public static final String NODE_SEEDS = "seeds";
  /** Include regexps node.  The value of this node contains the regexps that
  * must match the canonical URL in order for that URL to be included in the crawl.  These
  * regexps are newline separated, and # starts a comment. */
  public static final String NODE_INCLUDES = "includes";
  /** Exclude regexps node.  The value of this node contains the regexps that
  * if any one matches, causes the URL to be excluded from the crawl.  These
  * regexps are newline separated, and # starts a comment. */
  public static final String NODE_EXCLUDES = "excludes";
  /** Include-for-indexing regexps node.  The value of this node contains the regexps that
  * must match the canonical URL in order for that URL to be included for indexing.  These
  * regexps are newline separated, and # starts a comment. */
  public static final String NODE_INCLUDESINDEX = "includesindex";
  /** Exclude-from-indexing regexps node.  The value of this node contains the regexps that
  * if any one matches, causes the URL to be excluded from indexing.  These
  * regexps are newline separated, and # starts a comment. */
  public static final String NODE_EXCLUDESINDEX = "excludesindex";
  /** Content exclusion node.  Excludes from the index any page whose body contains
  * a specified regex. */
  public static final String NODE_EXCLUDESCONTENTINDEX = "excludescontentindex";
  /** Limit to seeds.  When value attribute is true, only seed domains will be permitted. */
  public static final String NODE_LIMITTOSEEDS = "limittoseeds";
  /** Canonicalization rule.  Attributes are regexp, description, reorder,
  * javasessionremoval, aspsessionremoval, phpsessionremoval, bvsessionremoval */
  public static final String NODE_URLSPEC = "urlspec";
  /** Forced acl access token node.  Attribute is "token". */
  public static final String NODE_ACCESS = "access";
  /** Exclude header node.  The value of this node lists a single header (in lower case) that
  * should be excluded from the document metadata */
  public static final String NODE_EXCLUDEHEADER = "excludeheader";

  /** regexp attribute */
  public static final String ATTR_REGEXP = "regexp";
  /** description attribute */
  public static final String ATTR_DESCRIPTION = "description";
  /** reorder attribute */
  public static final String ATTR_REORDER = "reorder";
  /** javasessionremoval attribute */
  public static final String ATTR_JAVASESSIONREMOVAL = "javasessionremoval";
  /** aspsessionremoval attribute */
  public static final String ATTR_ASPSESSIONREMOVAL = "aspsessionremoval";
  /** phpsessionremoval attribute */
  public static final String ATTR_PHPSESSIONREMOVAL = "phpsessionremoval";
  /** bvsessionremoval attribute */
  public static final String ATTR_BVSESSIONREMOVAL = "bvsessionremoval";
  /** name attribute */
  public static final String ATTR_NAME = "name";
  /** token attribute */
  public static final String ATTR_TOKEN = "token";

  /** Value yes */
  public static final String ATTRVALUE_YES = "yes";
  /** Value no */
  public static final String ATTRVALUE_NO = "no";
  /** Value false */
  public static final String ATTRVALUE_FALSE = "false";
  /** Value true */
  public static final String ATTRVALUE_TRUE = "true";

  /** Match attribute */
  public static final String ATTR_MATCH = "match";
  /** Map attribute */
  public static final String ATTR_MAP = "map";

}