/* $Id: CredentialsDescription.java 988245 2010-08-23 18:39:35Z kwright $ */ /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.webcrawler; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.crawler.system.ManifoldCF; import java.util.*; import java.util.regex.*; import org.apache.http.auth.Credentials; import org.apache.http.auth.NTCredentials; import org.apache.http.auth.UsernamePasswordCredentials; /** This class describes credential information pulled from a configuration. * The data contained is organized by regular expression performed on a url. What we store * for each regular expression is a Pattern, for efficiency. * * This structure deals with credentials as applied to a matching set of urls. It handles sequence-based * credentials as well as page-based credentials - that is, session-type authentication descriptions as well * as well as basic/ntlm authentication. (The two are in fact not mutually exclusive!!) * * For page-based credentials, a method is provided that locates the proper credential to use based on the page's url. * * For sequence-based credentials, a different method is provided. This reflects the fact that the underlying functionality * of sequence-based credentials differs enormously from that of page-based. * * Generally it is a good thing to limit the number of regexps that need to be evaluated against * any given url value as much as possible. For that reason I've organized this structure * accordingly. */ public class CredentialsDescription { public static final String _rcsid = "@(#)$Id: CredentialsDescription.java 988245 2010-08-23 18:39:35Z kwright $"; /** This is the hash that contains everything. It's keyed by the regexp string itself. * Values are CredentialsItem objects. */ protected HashMap patternHash = new HashMap(); /** Constructor. Build the description from the ConfigParams. */ public CredentialsDescription(ConfigParams configData) throws ManifoldCFException { // Scan, looking for bin description nodes int i = 0; while (i < configData.getChildCount()) { ConfigNode node = configData.getChild(i++); if (node.getType().equals(WebcrawlerConfig.NODE_ACCESSCREDENTIAL)) { // Get the url regexp String urlDescription = node.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP); try { Pattern p; try { p = Pattern.compile(urlDescription,Pattern.UNICODE_CASE); } catch (java.util.regex.PatternSyntaxException e) { throw new ManifoldCFException("Access credential regular expression '"+urlDescription+"' is illegal: "+e.getMessage(),e); } CredentialsItem ti = new CredentialsItem(p); String type = node.getAttributeValue(WebcrawlerConfig.ATTR_TYPE); // These get used in two of the three types; no harm in fetching them up front. String userName = node.getAttributeValue(WebcrawlerConfig.ATTR_USERNAME); String password = node.getAttributeValue(WebcrawlerConfig.ATTR_PASSWORD); if (password != null) password = ManifoldCF.deobfuscate(password); if (type.equals(WebcrawlerConfig.ATTRVALUE_BASIC)) ti.setCredential(new BasicCredential(userName,password)); else if (type.equals(WebcrawlerConfig.ATTRVALUE_NTLM)) { String domain = node.getAttributeValue(WebcrawlerConfig.ATTR_DOMAIN); ti.setCredential(new NTLMCredential(domain,userName,password)); } else if (type.equals(WebcrawlerConfig.ATTRVALUE_SESSION)) { // This is a complex credential type that cannot be easily set up with just a constructor. // Use the url regexp as the sequence key; this works as well as anything, although I haven't thought through all the implications if it gets changed. SessionCredential sc = new SessionCredential(urlDescription); // Loop through child nodes; they describe the pages that belong to the login sequence. int j = 0; while (j < node.getChildCount()) { ConfigNode child = node.getChild(j++); if (child.getType().equals(WebcrawlerConfig.NODE_AUTHPAGE)) { String authPageRegexp = child.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP); String pageType = child.getAttributeValue(WebcrawlerConfig.ATTR_TYPE); String matchRegexp = child.getAttributeValue(WebcrawlerConfig.ATTR_MATCHREGEXP); String overrideTargetURL = child.getAttributeValue(WebcrawlerConfig.ATTR_OVERRIDETARGETURL); if (overrideTargetURL != null && overrideTargetURL.length() == 0) overrideTargetURL = null; Pattern authPattern; try { authPattern = Pattern.compile(authPageRegexp,Pattern.UNICODE_CASE); } catch (java.util.regex.PatternSyntaxException e) { throw new ManifoldCFException("Authentication page regular expression '"+authPageRegexp+"' is illegal: "+e.getMessage(),e); } Pattern matchPattern; try { matchPattern = Pattern.compile(matchRegexp,Pattern.UNICODE_CASE); } catch (java.util.regex.PatternSyntaxException e) { throw new ManifoldCFException("Match regular expression '"+matchRegexp+"' is illegal: "+e.getMessage(),e); } if (pageType.equals(WebcrawlerConfig.ATTRVALUE_FORM)) { sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,matchRegexp,matchPattern,null,null,null,null); } else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_LINK)) { sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,matchRegexp,matchPattern,null,null,null,null,null,null); } else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_REDIRECTION)) { sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,null,null,matchRegexp,matchPattern,null,null); } else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_CONTENT)) { sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,null,null,null,null,matchRegexp,matchPattern); } else throw new ManifoldCFException("Invalid page type: "+pageType); // Finally, walk through any specified parameters int k = 0; while (k < child.getChildCount()) { ConfigNode paramNode = child.getChild(k++); if (paramNode.getType().equals(WebcrawlerConfig.NODE_AUTHPARAMETER)) { String paramName = paramNode.getAttributeValue(WebcrawlerConfig.ATTR_NAMEREGEXP); Pattern paramNamePattern; try { paramNamePattern = Pattern.compile(paramName,Pattern.UNICODE_CASE); } catch (java.util.regex.PatternSyntaxException e) { throw new ManifoldCFException("Parameter name regular expression '"+paramName+"' is illegal: "+e.getMessage(),e); } String passwordValue = paramNode.getAttributeValue(WebcrawlerConfig.ATTR_PASSWORD); String paramValue = paramNode.getAttributeValue(WebcrawlerConfig.ATTR_VALUE); if (passwordValue != null) paramValue = ManifoldCF.deobfuscate(passwordValue); sc.addPageParameter(authPageRegexp,paramName,paramNamePattern,paramValue); } } } } ti.setCredential(sc); } else throw new ManifoldCFException("Illegal credential type: "+type); patternHash.put(urlDescription,ti); } catch (PatternSyntaxException e) { throw new ManifoldCFException("Bad pattern syntax in '"+urlDescription+"'",e); } } } } /** Given a URL, find the right PageCredentials object to use. If more than one match is found, * use NEITHER object. */ public PageCredentials getPageCredential(String url) { PageCredentials c = null; Iterator iter = patternHash.keySet().iterator(); while (iter.hasNext()) { String urlDescription = (String)iter.next(); CredentialsItem ti = (CredentialsItem)patternHash.get(urlDescription); Pattern p = ti.getPattern(); AuthenticationCredentials ac = ti.getCredential(); if (ac instanceof PageCredentials) { Matcher m = p.matcher(url); if (m.find()) { if (c != null) return null; c = (PageCredentials)ac; } } } return c; } /** Given a URL, find the right SequenceCredentials object to use. If more than one match is found, * use NEITHER object. */ public SequenceCredentials getSequenceCredential(String url) { SequenceCredentials c = null; Iterator iter = patternHash.keySet().iterator(); while (iter.hasNext()) { String urlDescription = (String)iter.next(); CredentialsItem ti = (CredentialsItem)patternHash.get(urlDescription); Pattern p = ti.getPattern(); AuthenticationCredentials ac = ti.getCredential(); if (ac instanceof SequenceCredentials) { Matcher m = p.matcher(url); if (m.find()) { if (c != null) return null; c = (SequenceCredentials)ac; } } } return c; } /** Class representing an individual credential item. */ protected static class CredentialsItem { /** The bin-matching pattern. */ protected Pattern pattern; /** The credential */ protected AuthenticationCredentials authentication; /** Constructor. */ public CredentialsItem(Pattern p) { pattern = p; } /** Get the pattern. */ public Pattern getPattern() { return pattern; } /** Set Credentials */ public void setCredential(AuthenticationCredentials authentication) { this.authentication = authentication; } /** Get credential type */ public AuthenticationCredentials getCredential() { return authentication; } } /** Session credential parameter class */ protected static class SessionCredentialParameter { /** Name regexp */ protected String nameRegexp; /** Compiled name pattern */ protected Pattern namePattern; /** Value **/ protected String value; public SessionCredentialParameter(String nameRegexp, Pattern namePattern, String value) { this.nameRegexp = nameRegexp; this.namePattern = namePattern; this.value = value; } public Pattern getNamePattern() { return namePattern; } public String getValue() { return value; } public boolean equals(Object o) { if (!(o instanceof SessionCredentialParameter)) return false; SessionCredentialParameter sc = (SessionCredentialParameter)o; return nameRegexp.equals(sc.nameRegexp) && value.equals(sc.value); } public int hashCode() { return nameRegexp.hashCode() + value.hashCode(); } } /** Session credential helper class */ protected static class SessionCredentialItem implements LoginParameters { /** url regexp */ protected final String regexp; /** Url match pattern */ protected final Pattern pattern; /** Override target URL */ protected final String overrideTargetURL; /** The preferred redirection regexp */ protected final String preferredRedirectionRegexp; /** The preferred redirection pattern, or null if there's no preferred redirection */ protected final Pattern preferredRedirectionPattern; /** The preferred link regexp */ protected final String preferredLinkRegexp; /** The preferred link pattern, or null if there's no preferred link */ protected final Pattern preferredLinkPattern; /** The form name regexp */ protected final String formNameRegexp; /** The form name pattern, or null if no form is expected */ protected final Pattern formNamePattern; /** The content regexp */ protected final String contentRegexp; /** The content pattern, or null if no content is sought for */ protected final Pattern contentPattern; /** The list of the parameters we want to add for this pattern. */ protected final List parameters = new ArrayList(); /** Constructor */ public SessionCredentialItem(String regexp, Pattern p, String overrideTargetURL, String preferredLinkRegexp, Pattern preferredLinkPattern, String formNameRegexp, Pattern formNamePattern, String preferredRedirectionRegexp, Pattern preferredRedirectionPattern, String contentRegexp, Pattern contentPattern) { this.regexp = regexp; this.pattern = p; this.overrideTargetURL = overrideTargetURL; this.preferredLinkRegexp = preferredLinkRegexp; this.preferredLinkPattern = preferredLinkPattern; this.formNameRegexp = formNameRegexp; this.formNamePattern = formNamePattern; this.preferredRedirectionRegexp = preferredRedirectionRegexp; this.preferredRedirectionPattern = preferredRedirectionPattern; this.contentRegexp = contentRegexp; this.contentPattern = contentPattern; } /** Add parameter */ public void addParameter(String nameRegexp, Pattern namePattern, String value) { parameters.add(new SessionCredentialParameter(nameRegexp,namePattern,value)); } /** Get the pattern */ public Pattern getPattern() { return pattern; } /** Get the override target URL. */ public String getOverrideTargetURL() { return overrideTargetURL; } /** Get the preferred redirection pattern. */ public Pattern getPreferredRedirectionPattern() { return preferredRedirectionPattern; } /** Get the preferred link pattern. */ public Pattern getPreferredLinkPattern() { return preferredLinkPattern; } /** Get the form name pattern. */ public Pattern getFormNamePattern() { return formNamePattern; } /** Get the content pattern. */ public Pattern getContentPattern() { return contentPattern; } /** Get the name of the i'th parameter. */ public Pattern getParameterNamePattern(int index) { return getParameter(index).getNamePattern(); } /** Get the desired value of the i'th parameter. */ public String getParameterValue(int index) { return getParameter(index).getValue(); } /** Get the parameter count */ public int getParameterCount() { return parameters.size(); } /** Get the actual parameter */ public SessionCredentialParameter getParameter(int index) { return (SessionCredentialParameter)parameters.get(index); } public boolean equals(Object o) { if (!(o instanceof SessionCredentialItem)) return false; SessionCredentialItem sci = (SessionCredentialItem)o; if (!regexp.equals(sci.regexp)) return false; if (preferredRedirectionRegexp == null || sci.preferredRedirectionRegexp == null) { if (preferredRedirectionRegexp != sci.preferredRedirectionRegexp) return false; } else if (!preferredRedirectionRegexp.equals(sci.preferredRedirectionRegexp)) return false; if (preferredLinkRegexp == null || sci.preferredLinkRegexp == null) { if (preferredLinkRegexp != sci.preferredLinkRegexp) return false; } else if (!preferredLinkRegexp.equals(sci.preferredLinkRegexp)) return false; if (formNameRegexp == null || sci.formNameRegexp == null) { if (formNameRegexp != sci.formNameRegexp) return false; } else if (!formNameRegexp.equals(sci.formNameRegexp)) return false; if (contentRegexp == null || sci.contentRegexp == null) { if (contentRegexp != sci.contentRegexp) return false; } else if (!contentRegexp.equals(sci.contentRegexp)) return false; if (parameters.size() != sci.parameters.size()) return false; int i = 0; while (i < parameters.size()) { if (!((SessionCredentialParameter)parameters.get(i)).equals((SessionCredentialParameter)sci.parameters.get(i))) return false; i++; } return true; } public int hashCode() { int rval = regexp.hashCode() + ((preferredRedirectionRegexp==null)?0:preferredRedirectionRegexp.hashCode()) + ((preferredLinkRegexp==null)?0:preferredLinkRegexp.hashCode()) + ((formNameRegexp==null)?0:formNameRegexp.hashCode()) + ((contentRegexp==null)?0:contentRegexp.hashCode()); int i = 0; while (i < parameters.size()) { rval += parameters.get(i).hashCode(); i++; } return rval; } } /** LoginParameter iterator */ protected static class LoginParameterIterator implements Iterator { protected Map sessionPages; protected Iterator sessionPageIterator; protected String documentIdentifier; protected LoginParameters currentOne = null; /** Constructor */ public LoginParameterIterator(Map sessionPages, String documentIdentifier) { this.sessionPages = sessionPages; this.documentIdentifier = documentIdentifier; this.sessionPageIterator = sessionPages.keySet().iterator(); } /** Find next one */ protected void findNextOne() { if (currentOne != null) return; while (sessionPageIterator.hasNext()) { String key = (String)sessionPageIterator.next(); SessionCredentialItem sci = (SessionCredentialItem)sessionPages.get(key); Matcher m = sci.getPattern().matcher(documentIdentifier); if (m.find()) { currentOne = sci; return; } } } /** Check for next */ public boolean hasNext() { findNextOne(); return (currentOne != null); } /** Get the next one */ public Object next() { findNextOne(); Object rval = currentOne; currentOne = null; return rval; } public void remove() { throw new Error("Unimplemented function"); } } /** Session credentials */ protected static class SessionCredential implements SequenceCredentials { protected String sequenceKey; protected Map sessionPages = new HashMap(); /** Constructor */ public SessionCredential(String sequenceKey) { this.sequenceKey = sequenceKey; } /** Add an auth page */ public void addAuthPage(String urlregexp, Pattern urlPattern, String overrideTargetURL, String preferredLinkRegexp, Pattern preferredLinkPattern, String formNameRegexp, Pattern formNamePattern, String preferredRedirectionRegexp, Pattern preferredRedirectionPattern, String contentRegexp, Pattern contentPattern) throws ManifoldCFException { sessionPages.put(urlregexp,new SessionCredentialItem(urlregexp,urlPattern, overrideTargetURL, preferredLinkRegexp,preferredLinkPattern, formNameRegexp,formNamePattern, preferredRedirectionRegexp,preferredRedirectionPattern, contentRegexp,contentPattern)); } /** Add a page parameter */ public void addPageParameter(String urlregexp, String paramNameRegexp, Pattern paramNamePattern, String paramValue) { SessionCredentialItem sci = (SessionCredentialItem)sessionPages.get(urlregexp); sci.addParameter(paramNameRegexp,paramNamePattern,paramValue); } /** Fetch the unique key value for this particular credential. (This is used to enforce the proper page ordering). */ public String getSequenceKey() { return sequenceKey; } /** For a given login page, specific information may need to be submitted to the server to properly log in. This information * must be specified as part of the login sequence description information. * If null is returned, then this page has no specific login information. */ public Iterator findLoginParameters(String documentIdentifier) throws ManifoldCFException { return new LoginParameterIterator(sessionPages,documentIdentifier); } /** Compare against another object */ public boolean equals(Object o) { if (!(o instanceof SessionCredential)) return false; SessionCredential b = (SessionCredential)o; if (b.sessionPages.size() != sessionPages.size()) return false; Iterator iter = sessionPages.keySet().iterator(); while (iter.hasNext()) { String key = (String)iter.next(); SessionCredentialItem sci = (SessionCredentialItem)sessionPages.get(key); SessionCredentialItem bsci = (SessionCredentialItem)b.sessionPages.get(key); if (bsci == null) return false; if (!sci.equals(bsci)) return false; } return true; } /** Calculate a hash function */ public int hashCode() { int rval = 0; Iterator iter = sessionPages.keySet().iterator(); while (iter.hasNext()) { String key = (String)iter.next(); SessionCredentialItem sci = (SessionCredentialItem)sessionPages.get(key); rval += sci.hashCode(); } return rval; } } /** Basic type credentials */ protected static class BasicCredential implements PageCredentials { protected String userName; protected String password; protected UsernamePasswordCredentials credentialsObject; /** Constructor */ public BasicCredential(String userName, String password) { this.userName = userName; this.password = password; credentialsObject = new UsernamePasswordCredentials(userName,password); } /** Turn this instance into a Credentials object, given the specified target host name */ public Credentials makeCredentialsObject(String targetHostName) throws ManifoldCFException { return credentialsObject; } /** Compare against another object */ public boolean equals(Object o) { if (!(o instanceof BasicCredential)) return false; BasicCredential b = (BasicCredential)o; return b.userName.equals(userName) && b.password.equals(password); } /** Calculate a hash function */ public int hashCode() { return userName.hashCode() + password.hashCode(); } } /** NTLM-style credentials */ protected static class NTLMCredential implements PageCredentials { protected String domain; protected String userName; protected String password; // No Credentials object here because it depends on the hostname /** Constructor */ public NTLMCredential(String domain, String userName, String password) { this.domain = domain; this.userName = userName; this.password = password; } /** Turn this instance into a Credentials object, given the specified target host name */ public Credentials makeCredentialsObject(String targetHostName) throws ManifoldCFException { return new NTCredentials(userName,password,targetHostName,domain); } /** Compare against another object */ public boolean equals(Object o) { if (!(o instanceof NTLMCredential)) return false; NTLMCredential b = (NTLMCredential)o; return b.userName.equals(userName) && b.password.equals(password) && b.domain.equals(domain); } /** Calculate a hash function */ public int hashCode() { return userName.hashCode() + password.hashCode() + domain.hashCode(); } } }