/* $Id: CredentialsDescription.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.crawler.system.ManifoldCF;
import java.util.*;
import java.util.regex.*;
import org.apache.http.auth.Credentials;
import org.apache.http.auth.NTCredentials;
import org.apache.http.auth.UsernamePasswordCredentials;
/** This class describes credential information pulled from a configuration.
* The data contained is organized by regular expression performed on a url. What we store
* for each regular expression is a Pattern, for efficiency.
*
* This structure deals with credentials as applied to a matching set of urls. It handles sequence-based
* credentials as well as page-based credentials - that is, session-type authentication descriptions
* as well as basic/ntlm authentication. (The two are in fact not mutually exclusive!!)
*
* For page-based credentials, a method is provided that locates the proper credential to use based on the page's url.
*
* For sequence-based credentials, a different method is provided. This reflects the fact that the underlying functionality
* of sequence-based credentials differs enormously from that of page-based.
*
* Generally it is a good thing to limit the number of regexps that need to be evaluated against
* any given url value as much as possible. For that reason I've organized this structure
* accordingly.
*/
public class CredentialsDescription
{
/** Source-control identification string for this file. */
public static final String _rcsid = "@(#)$Id: CredentialsDescription.java 988245 2010-08-23 18:39:35Z kwright $";
/** This is the hash that contains everything. It's keyed by the regexp string itself
* (the url regexp attribute of each access credential node), so a later node with the
* same regexp string replaces an earlier one.
* Values are CredentialsItem objects. */
protected HashMap patternHash = new HashMap();
/** Constructor. Build the description from the ConfigParams.
* Every child node of type WebcrawlerConfig.NODE_ACCESSCREDENTIAL is turned into a
* CredentialsItem and stored in patternHash under its url regexp string. Nodes of any
* other type are skipped.
* @param configData is the configuration to scan for access credential nodes.
* @throws ManifoldCFException if a regular expression is missing or malformed, or if a
* credential type or authentication page type is missing or not recognized.
*/
public CredentialsDescription(ConfigParams configData)
  throws ManifoldCFException
{
  // Scan, looking for access credential nodes
  for (int i = 0; i < configData.getChildCount(); i++)
  {
    ConfigNode node = configData.getChild(i);
    if (!node.getType().equals(WebcrawlerConfig.NODE_ACCESSCREDENTIAL))
      continue;

    // Get the url regexp; it doubles as the key under which the item is stored
    String urlDescription = node.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP);
    try
    {
      patternHash.put(urlDescription,parseAccessCredential(node,urlDescription));
    }
    catch (PatternSyntaxException e)
    {
      // Safety net kept from the earlier code: every Pattern.compile call made below already
      // reports its own error, so this only fires if some other call raises the exception.
      throw new ManifoldCFException("Bad pattern syntax in '"+urlDescription+"'",e);
    }
  }
}

/** Convert one access credential node into a CredentialsItem.
* @param node is the access credential node.
* @param urlDescription is the node's url regexp attribute value.
* @return the item, with its pattern compiled and its credential set.
* @throws ManifoldCFException if the node cannot be interpreted.
*/
private static CredentialsItem parseAccessCredential(ConfigNode node, String urlDescription)
  throws ManifoldCFException
{
  CredentialsItem ti = new CredentialsItem(compileRegexp("Access credential",urlDescription));
  String type = node.getAttributeValue(WebcrawlerConfig.ATTR_TYPE);
  // These get used in two of the three types; no harm in fetching them up front.
  String userName = node.getAttributeValue(WebcrawlerConfig.ATTR_USERNAME);
  String password = node.getAttributeValue(WebcrawlerConfig.ATTR_PASSWORD);
  if (password != null)
    password = ManifoldCF.deobfuscate(password);

  // The constant is the receiver of equals() so that a missing type attribute falls through
  // to the "Illegal credential type" error instead of raising a NullPointerException.
  if (WebcrawlerConfig.ATTRVALUE_BASIC.equals(type))
    ti.setCredential(new BasicCredential(userName,password));
  else if (WebcrawlerConfig.ATTRVALUE_NTLM.equals(type))
  {
    String domain = node.getAttributeValue(WebcrawlerConfig.ATTR_DOMAIN);
    ti.setCredential(new NTLMCredential(domain,userName,password));
  }
  else if (WebcrawlerConfig.ATTRVALUE_SESSION.equals(type))
    ti.setCredential(parseSessionCredential(node,urlDescription));
  else
    throw new ManifoldCFException("Illegal credential type: "+type);
  return ti;
}

/** Build a session credential from an access credential node of session type.
* This is a complex credential type that cannot be easily set up with just a constructor:
* the node's auth page children describe the pages that belong to the login sequence.
* @param node is the access credential node.
* @param sequenceKey is the sequence key to use; the caller passes the url regexp.
* @return the populated session credential.
* @throws ManifoldCFException if an auth page child cannot be interpreted.
*/
private static SessionCredential parseSessionCredential(ConfigNode node, String sequenceKey)
  throws ManifoldCFException
{
  SessionCredential sc = new SessionCredential(sequenceKey);
  for (int j = 0; j < node.getChildCount(); j++)
  {
    ConfigNode child = node.getChild(j);
    if (child.getType().equals(WebcrawlerConfig.NODE_AUTHPAGE))
      parseAuthPage(sc,child);
  }
  return sc;
}

/** Register one auth page node, together with its parameter nodes, on a session credential.
* @param sc is the session credential being built.
* @param pageNode is the auth page node.
* @throws ManifoldCFException if a regexp is missing or malformed, or the page type is unknown.
*/
private static void parseAuthPage(SessionCredential sc, ConfigNode pageNode)
  throws ManifoldCFException
{
  String authPageRegexp = pageNode.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP);
  String pageType = pageNode.getAttributeValue(WebcrawlerConfig.ATTR_TYPE);
  String matchRegexp = pageNode.getAttributeValue(WebcrawlerConfig.ATTR_MATCHREGEXP);
  String overrideTargetURL = pageNode.getAttributeValue(WebcrawlerConfig.ATTR_OVERRIDETARGETURL);
  // An empty override is the same as no override at all
  if (overrideTargetURL != null && overrideTargetURL.length() == 0)
    overrideTargetURL = null;

  Pattern authPattern = compileRegexp("Authentication page",authPageRegexp);
  Pattern matchPattern = compileRegexp("Match",matchRegexp);

  // The page type decides which regexp/pattern slot of addAuthPage receives the match expression.
  // Constant-first comparisons make a missing page type end up in the final else.
  if (WebcrawlerConfig.ATTRVALUE_FORM.equals(pageType))
    sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,matchRegexp,matchPattern,null,null,null,null);
  else if (WebcrawlerConfig.ATTRVALUE_LINK.equals(pageType))
    sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,matchRegexp,matchPattern,null,null,null,null,null,null);
  else if (WebcrawlerConfig.ATTRVALUE_REDIRECTION.equals(pageType))
    sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,null,null,matchRegexp,matchPattern,null,null);
  else if (WebcrawlerConfig.ATTRVALUE_CONTENT.equals(pageType))
    sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,null,null,null,null,matchRegexp,matchPattern);
  else
    throw new ManifoldCFException("Invalid page type: "+pageType);

  // Finally, walk through any specified parameters
  for (int k = 0; k < pageNode.getChildCount(); k++)
  {
    ConfigNode paramNode = pageNode.getChild(k);
    if (!paramNode.getType().equals(WebcrawlerConfig.NODE_AUTHPARAMETER))
      continue;
    String paramName = paramNode.getAttributeValue(WebcrawlerConfig.ATTR_NAMEREGEXP);
    Pattern paramNamePattern = compileRegexp("Parameter name",paramName);
    String passwordValue = paramNode.getAttributeValue(WebcrawlerConfig.ATTR_PASSWORD);
    String paramValue = paramNode.getAttributeValue(WebcrawlerConfig.ATTR_VALUE);
    // A password attribute, when present, takes precedence over the plain value attribute
    if (passwordValue != null)
      paramValue = ManifoldCF.deobfuscate(passwordValue);
    sc.addPageParameter(authPageRegexp,paramName,paramNamePattern,paramValue);
  }
}

/** Compile a configured regular expression, reporting any problem as a ManifoldCFException.
* @param description names the kind of expression; it becomes the start of the error message.
* @param regexp is the expression text read from the configuration, or null if the attribute was absent.
* @return the compiled pattern, compiled with the Pattern.UNICODE_CASE flag.
* @throws ManifoldCFException if the expression is absent or has illegal syntax.
*/
private static Pattern compileRegexp(String description, String regexp)
  throws ManifoldCFException
{
  // Pattern.compile(null) would raise a bare NullPointerException; report it properly instead
  if (regexp == null)
    throw new ManifoldCFException(description+" regular expression is missing");
  try
  {
    return Pattern.compile(regexp,Pattern.UNICODE_CASE);
  }
  catch (PatternSyntaxException e)
  {
    throw new ManifoldCFException(description+" regular expression '"+regexp+"' is illegal: "+e.getMessage(),e);
  }
}
/** Locate the page-based credential that applies to a url.
* @param url is the url to look up.
* @return the one PageCredentials object whose url pattern is found in the url, or null when
* no pattern is found in it or when more than one is (an ambiguous match uses NEITHER object).
*/
public PageCredentials getPageCredential(String url)
{
  PageCredentials found = null;
  for (Object entry : patternHash.values())
  {
    CredentialsItem item = (CredentialsItem)entry;
    AuthenticationCredentials candidate = item.getCredential();
    // Ignore credentials of other kinds, and those whose pattern does not occur in the url
    if (!(candidate instanceof PageCredentials) || !item.getPattern().matcher(url).find())
      continue;
    // A second hit makes the choice ambiguous
    if (found != null)
      return null;
    found = (PageCredentials)candidate;
  }
  return found;
}
/** Locate the sequence-based credential that applies to a url.
* @param url is the url to look up.
* @return the one SequenceCredentials object whose url pattern is found in the url, or null when
* no pattern is found in it or when more than one is (an ambiguous match uses NEITHER object).
*/
public SequenceCredentials getSequenceCredential(String url)
{
  SequenceCredentials found = null;
  for (Object entry : patternHash.values())
  {
    CredentialsItem item = (CredentialsItem)entry;
    AuthenticationCredentials candidate = item.getCredential();
    // Ignore credentials of other kinds, and those whose pattern does not occur in the url
    if (!(candidate instanceof SequenceCredentials) || !item.getPattern().matcher(url).find())
      continue;
    // A second hit makes the choice ambiguous
    if (found != null)
      return null;
    found = (SequenceCredentials)candidate;
  }
  return found;
}
/** One entry of the credentials table: a compiled url pattern paired with the
* credential that applies to urls in which that pattern is found.
*/
protected static class CredentialsItem
{
  /** The compiled url-matching pattern, fixed at construction. */
  protected Pattern pattern;
  /** The credential attached to the pattern; stays null until setCredential is called. */
  protected AuthenticationCredentials authentication;

  /** Create an item for a pattern; the credential is supplied afterwards.
  * @param p is the compiled url pattern.
  */
  public CredentialsItem(Pattern p)
  {
    this.pattern = p;
  }

  /** @return the compiled url pattern. */
  public Pattern getPattern()
  {
    return this.pattern;
  }

  /** Attach the credential to use for matching urls.
  * @param authentication is the credential.
  */
  public void setCredential(AuthenticationCredentials authentication)
  {
    this.authentication = authentication;
  }

  /** @return the attached credential, or null if none has been set. */
  public AuthenticationCredentials getCredential()
  {
    return this.authentication;
  }
}
/** Session credential parameter class.
* Describes one form parameter to supply on a login page: a regular expression selecting the
* parameter name, plus the value to submit. Equality is based on the name regexp text and
* the value; the compiled pattern is derived data and takes no part in it.
*/
protected static class SessionCredentialParameter
{
  /** Name regexp */
  protected String nameRegexp;
  /** Compiled name pattern */
  protected Pattern namePattern;
  /** Value; null when the configuration supplied neither a value nor a password. */
  protected String value;

  /** Constructor.
  * @param nameRegexp is the text of the parameter name regular expression.
  * @param namePattern is the compiled form of nameRegexp.
  * @param value is the value to submit; may be null.
  */
  public SessionCredentialParameter(String nameRegexp, Pattern namePattern, String value)
  {
    this.nameRegexp = nameRegexp;
    this.namePattern = namePattern;
    this.value = value;
  }

  /** @return the compiled parameter name pattern. */
  public Pattern getNamePattern()
  {
    return namePattern;
  }

  /** @return the value to submit, possibly null. */
  public String getValue()
  {
    return value;
  }

  /** Compare against another object. Null fields are tolerated: two nulls are equal,
  * and a null never equals a non-null string.
  */
  public boolean equals(Object o)
  {
    if (!(o instanceof SessionCredentialParameter))
      return false;
    SessionCredentialParameter sc = (SessionCredentialParameter)o;
    return sameText(nameRegexp,sc.nameRegexp) && sameText(value,sc.value);
  }

  /** Calculate a hash function. A null field contributes 0, so the result for non-null
  * fields is the plain sum of the two string hashes.
  */
  public int hashCode()
  {
    return ((nameRegexp == null)?0:nameRegexp.hashCode()) + ((value == null)?0:value.hashCode());
  }

  /** Null-tolerant string equality. */
  private static boolean sameText(String a, String b)
  {
    return (a == null) ? (b == null) : a.equals(b);
  }
}
/** Session credential helper class.
* Describes one page of a login sequence: the url pattern that identifies the page, an
* optional override target URL, and up to four optional expectations about the page (a
* preferred link, a form name, a preferred redirection, page content), each held both as
* regexp text and as a compiled pattern. It also carries the ordered list of parameters to
* supply for the page.
* Equality and hashing work on the regexp texts, the override target URL and the parameters;
* the compiled patterns are derived data and are not compared.
*/
protected static class SessionCredentialItem implements LoginParameters
{
  /** url regexp */
  protected final String regexp;
  /** Url match pattern */
  protected final Pattern pattern;
  /** Override target URL, or null if there is none */
  protected final String overrideTargetURL;
  /** The preferred redirection regexp */
  protected final String preferredRedirectionRegexp;
  /** The preferred redirection pattern, or null if there's no preferred redirection */
  protected final Pattern preferredRedirectionPattern;
  /** The preferred link regexp */
  protected final String preferredLinkRegexp;
  /** The preferred link pattern, or null if there's no preferred link */
  protected final Pattern preferredLinkPattern;
  /** The form name regexp */
  protected final String formNameRegexp;
  /** The form name pattern, or null if no form is expected */
  protected final Pattern formNamePattern;
  /** The content regexp */
  protected final String contentRegexp;
  /** The content pattern, or null if no content is sought for */
  protected final Pattern contentPattern;
  /** The list of the parameters we want to add for this pattern (SessionCredentialParameter objects). */
  protected final List parameters = new ArrayList();

  /** Constructor.
  * @param regexp is the text of the page url regular expression.
  * @param p is the compiled form of regexp.
  * @param overrideTargetURL is the override target URL, or null.
  * @param preferredLinkRegexp is the preferred link regexp text, or null.
  * @param preferredLinkPattern is the compiled preferred link pattern, or null.
  * @param formNameRegexp is the form name regexp text, or null.
  * @param formNamePattern is the compiled form name pattern, or null.
  * @param preferredRedirectionRegexp is the preferred redirection regexp text, or null.
  * @param preferredRedirectionPattern is the compiled preferred redirection pattern, or null.
  * @param contentRegexp is the content regexp text, or null.
  * @param contentPattern is the compiled content pattern, or null.
  */
  public SessionCredentialItem(String regexp, Pattern p,
    String overrideTargetURL,
    String preferredLinkRegexp, Pattern preferredLinkPattern,
    String formNameRegexp, Pattern formNamePattern,
    String preferredRedirectionRegexp, Pattern preferredRedirectionPattern,
    String contentRegexp, Pattern contentPattern)
  {
    this.regexp = regexp;
    this.pattern = p;
    this.overrideTargetURL = overrideTargetURL;
    this.preferredLinkRegexp = preferredLinkRegexp;
    this.preferredLinkPattern = preferredLinkPattern;
    this.formNameRegexp = formNameRegexp;
    this.formNamePattern = formNamePattern;
    this.preferredRedirectionRegexp = preferredRedirectionRegexp;
    this.preferredRedirectionPattern = preferredRedirectionPattern;
    this.contentRegexp = contentRegexp;
    this.contentPattern = contentPattern;
  }

  /** Add parameter.
  * @param nameRegexp is the text of the parameter name regular expression.
  * @param namePattern is the compiled form of nameRegexp.
  * @param value is the value to supply for matching parameters.
  */
  public void addParameter(String nameRegexp, Pattern namePattern, String value)
  {
    parameters.add(new SessionCredentialParameter(nameRegexp,namePattern,value));
  }

  /** Get the pattern */
  public Pattern getPattern()
  {
    return pattern;
  }

  /** Get the override target URL.
  */
  public String getOverrideTargetURL()
  {
    return overrideTargetURL;
  }

  /** Get the preferred redirection pattern.
  */
  public Pattern getPreferredRedirectionPattern()
  {
    return preferredRedirectionPattern;
  }

  /** Get the preferred link pattern.
  */
  public Pattern getPreferredLinkPattern()
  {
    return preferredLinkPattern;
  }

  /** Get the form name pattern.
  */
  public Pattern getFormNamePattern()
  {
    return formNamePattern;
  }

  /** Get the content pattern.
  */
  public Pattern getContentPattern()
  {
    return contentPattern;
  }

  /** Get the name pattern of the i'th parameter.
  */
  public Pattern getParameterNamePattern(int index)
  {
    return getParameter(index).getNamePattern();
  }

  /** Get the desired value of the i'th parameter.
  */
  public String getParameterValue(int index)
  {
    return getParameter(index).getValue();
  }

  /** Get the parameter count */
  public int getParameterCount()
  {
    return parameters.size();
  }

  /** Get the actual parameter */
  public SessionCredentialParameter getParameter(int index)
  {
    return (SessionCredentialParameter)parameters.get(index);
  }

  /** Compare against another object.
  * Two items are equal when their url regexp, override target URL, the four optional
  * regexps, and their parameter lists (same order, pairwise equal) all agree.
  */
  public boolean equals(Object o)
  {
    if (!(o instanceof SessionCredentialItem))
      return false;
    SessionCredentialItem sci = (SessionCredentialItem)o;
    if (!regexp.equals(sci.regexp))
      return false;
    // The override target changes what getOverrideTargetURL() reports, so items that differ
    // only in it must not be treated as interchangeable.
    if (!sameText(overrideTargetURL,sci.overrideTargetURL))
      return false;
    if (!sameText(preferredRedirectionRegexp,sci.preferredRedirectionRegexp))
      return false;
    if (!sameText(preferredLinkRegexp,sci.preferredLinkRegexp))
      return false;
    if (!sameText(formNameRegexp,sci.formNameRegexp))
      return false;
    if (!sameText(contentRegexp,sci.contentRegexp))
      return false;
    // List equality: same size and pairwise-equal elements in the same order
    return parameters.equals(sci.parameters);
  }

  /** Calculate a hash function.
  * The override target URL is left out of the sum; that keeps the hash values unchanged
  * and is still consistent with equals, because equal items agree on every summed field.
  */
  public int hashCode()
  {
    int rval = regexp.hashCode() + textHash(preferredRedirectionRegexp) +
      textHash(preferredLinkRegexp) +
      textHash(formNameRegexp) +
      textHash(contentRegexp);
    for (int i = 0; i < parameters.size(); i++)
    {
      rval += parameters.get(i).hashCode();
    }
    return rval;
  }

  /** Null-tolerant string equality. */
  private static boolean sameText(String a, String b)
  {
    return (a == null) ? (b == null) : a.equals(b);
  }

  /** Hash of a possibly-null string; null hashes to 0. */
  private static int textHash(String s)
  {
    return (s == null) ? 0 : s.hashCode();
  }
}
/** LoginParameter iterator.
* Walks the session pages of a credential and yields, one at a time, each page description
* (as a LoginParameters object) whose url pattern is found in the given document identifier.
* Matching is done lazily: the next matching page is only searched for when hasNext() or
* next() is called.
*/
protected static class LoginParameterIterator implements Iterator
{
  /** Map of url regexp string to SessionCredentialItem. */
  protected Map sessionPages;
  /** Iterator over the keys of sessionPages. */
  protected Iterator sessionPageIterator;
  /** The document identifier the page patterns are matched against. */
  protected String documentIdentifier;
  /** The match found by look-ahead but not yet handed out, or null if there is none pending. */
  protected LoginParameters currentOne = null;

  /** Constructor.
  * @param sessionPages maps url regexp strings to SessionCredentialItem objects.
  * @param documentIdentifier is the document identifier to match.
  */
  public LoginParameterIterator(Map sessionPages, String documentIdentifier)
  {
    this.sessionPages = sessionPages;
    this.documentIdentifier = documentIdentifier;
    this.sessionPageIterator = sessionPages.keySet().iterator();
  }

  /** Find next one. Advances through the remaining pages until one matches, and parks it in
  * currentOne. Does nothing if a match is already pending.
  */
  protected void findNextOne()
  {
    if (currentOne != null)
      return;
    while (sessionPageIterator.hasNext())
    {
      String key = (String)sessionPageIterator.next();
      SessionCredentialItem sci = (SessionCredentialItem)sessionPages.get(key);
      Matcher m = sci.getPattern().matcher(documentIdentifier);
      if (m.find())
      {
        currentOne = sci;
        return;
      }
    }
  }

  /** Check for next.
  * @return true if another matching page remains.
  */
  public boolean hasNext()
  {
    findNextOne();
    return (currentOne != null);
  }

  /** Get the next one.
  * @return the next matching page, as a LoginParameters object.
  * @throws NoSuchElementException if no matching page remains, as the Iterator contract
  * requires (rather than handing the caller a null).
  */
  public Object next()
  {
    findNextOne();
    if (currentOne == null)
      throw new NoSuchElementException("No more login parameters for '"+documentIdentifier+"'");
    Object rval = currentOne;
    // Clear the look-ahead slot so the following call searches onward
    currentOne = null;
    return rval;
  }

  /** Removal is not supported.
  * @throws UnsupportedOperationException always; this is the exception the Iterator contract
  * prescribes for an unsupported remove.
  */
  public void remove()
  {
    throw new UnsupportedOperationException("Unimplemented function");
  }
}
/** Session credentials.
* A sequence-type credential: a sequence key plus a table of login-sequence pages, keyed by
* each page's url regexp string. Equality and hashing look only at the page table.
*/
protected static class SessionCredential implements SequenceCredentials
{
  /** The key identifying this login sequence. */
  protected String sequenceKey;
  /** Page table: url regexp string to SessionCredentialItem. */
  protected Map sessionPages = new HashMap();

  /** Constructor.
  * @param sequenceKey is the key identifying this login sequence.
  */
  public SessionCredential(String sequenceKey)
  {
    this.sequenceKey = sequenceKey;
  }

  /** Add an auth page. A page added earlier under the same url regexp is replaced.
  * The arguments are passed unchanged to the SessionCredentialItem constructor.
  */
  public void addAuthPage(String urlregexp, Pattern urlPattern,
    String overrideTargetURL,
    String preferredLinkRegexp, Pattern preferredLinkPattern,
    String formNameRegexp, Pattern formNamePattern,
    String preferredRedirectionRegexp, Pattern preferredRedirectionPattern,
    String contentRegexp, Pattern contentPattern)
    throws ManifoldCFException
  {
    SessionCredentialItem page = new SessionCredentialItem(urlregexp,urlPattern,
      overrideTargetURL,
      preferredLinkRegexp,preferredLinkPattern,
      formNameRegexp,formNamePattern,
      preferredRedirectionRegexp,preferredRedirectionPattern,
      contentRegexp,contentPattern);
    sessionPages.put(urlregexp,page);
  }

  /** Add a page parameter to a page previously registered with addAuthPage.
  * @param urlregexp is the url regexp string the page was registered under.
  * @param paramNameRegexp is the text of the parameter name regular expression.
  * @param paramNamePattern is the compiled form of paramNameRegexp.
  * @param paramValue is the value to supply.
  */
  public void addPageParameter(String urlregexp, String paramNameRegexp, Pattern paramNamePattern, String paramValue)
  {
    ((SessionCredentialItem)sessionPages.get(urlregexp)).addParameter(paramNameRegexp,paramNamePattern,paramValue);
  }

  /** Fetch the unique key value for this particular credential. (This is used to enforce the proper page ordering).
  */
  public String getSequenceKey()
  {
    return sequenceKey;
  }

  /** For a given login page, specific information may need to be submitted to the server to properly log in. This information
  * must be specified as part of the login sequence description information.
  * @param documentIdentifier is the document identifier of the page.
  * @return an iterator over the LoginParameters of every registered page whose url pattern
  * is found in the document identifier.
  */
  public Iterator findLoginParameters(String documentIdentifier)
    throws ManifoldCFException
  {
    return new LoginParameterIterator(sessionPages,documentIdentifier);
  }

  /** Compare against another object: equal when both hold the same number of pages and
  * every page here has an equal page under the same key in the other.
  */
  public boolean equals(Object o)
  {
    if (!(o instanceof SessionCredential))
      return false;
    SessionCredential other = (SessionCredential)o;
    if (sessionPages.size() != other.sessionPages.size())
      return false;
    for (Object e : sessionPages.entrySet())
    {
      Map.Entry entry = (Map.Entry)e;
      String pageKey = (String)entry.getKey();
      SessionCredentialItem mine = (SessionCredentialItem)entry.getValue();
      SessionCredentialItem theirs = (SessionCredentialItem)other.sessionPages.get(pageKey);
      if (theirs == null || !mine.equals(theirs))
        return false;
    }
    return true;
  }

  /** Calculate a hash function: the sum of the hashes of all pages. */
  public int hashCode()
  {
    int total = 0;
    for (Object page : sessionPages.values())
    {
      total += ((SessionCredentialItem)page).hashCode();
    }
    return total;
  }
}
/** Basic type credentials.
* Holds a user name and password and the UsernamePasswordCredentials object built from them
* once, at construction time.
*/
protected static class BasicCredential implements PageCredentials
{
  /** The user name. */
  protected String userName;
  /** The password; null when the configuration node carried no password attribute. */
  protected String password;
  /** The credentials object handed out by makeCredentialsObject. */
  protected UsernamePasswordCredentials credentialsObject;

  /** Constructor.
  * @param userName is the user name.
  * @param password is the password, possibly null.
  */
  public BasicCredential(String userName, String password)
  {
    this.userName = userName;
    this.password = password;
    credentialsObject = new UsernamePasswordCredentials(userName,password);
  }

  /** Turn this instance into a Credentials object, given the specified target host name.
  * @param targetHostName is not used by this credential type; the same object is returned for every host.
  * @return the credentials object built at construction time.
  */
  public Credentials makeCredentialsObject(String targetHostName)
    throws ManifoldCFException
  {
    return credentialsObject;
  }

  /** Compare against another object. Null fields are tolerated: two nulls are equal,
  * and a null never equals a non-null string.
  */
  public boolean equals(Object o)
  {
    if (!(o instanceof BasicCredential))
      return false;
    BasicCredential b = (BasicCredential)o;
    return sameText(b.userName,userName) && sameText(b.password,password);
  }

  /** Calculate a hash function. A null field contributes 0. */
  public int hashCode()
  {
    return textHash(userName) + textHash(password);
  }

  /** Null-tolerant string equality. */
  private static boolean sameText(String a, String b)
  {
    return (a == null) ? (b == null) : a.equals(b);
  }

  /** Hash of a possibly-null string; null hashes to 0. */
  private static int textHash(String s)
  {
    return (s == null) ? 0 : s.hashCode();
  }
}
/** NTLM-style credentials.
* Holds a domain, user name and password. No Credentials object is kept here because it
* depends on the target host name; one is created on each makeCredentialsObject call.
*/
protected static class NTLMCredential implements PageCredentials
{
  /** The domain; null when the configuration node carried no domain attribute. */
  protected String domain;
  /** The user name. */
  protected String userName;
  /** The password; null when the configuration node carried no password attribute. */
  protected String password;

  /** Constructor.
  * @param domain is the domain, possibly null.
  * @param userName is the user name.
  * @param password is the password, possibly null.
  */
  public NTLMCredential(String domain, String userName, String password)
  {
    this.domain = domain;
    this.userName = userName;
    this.password = password;
  }

  /** Turn this instance into a Credentials object, given the specified target host name.
  * @param targetHostName is passed to NTCredentials as its third (workstation) argument.
  * @return a new NTCredentials object for this user, password, host and domain.
  */
  public Credentials makeCredentialsObject(String targetHostName)
    throws ManifoldCFException
  {
    return new NTCredentials(userName,password,targetHostName,domain);
  }

  /** Compare against another object. Null fields are tolerated: two nulls are equal,
  * and a null never equals a non-null string.
  */
  public boolean equals(Object o)
  {
    if (!(o instanceof NTLMCredential))
      return false;
    NTLMCredential b = (NTLMCredential)o;
    return sameText(b.userName,userName) && sameText(b.password,password) &&
      sameText(b.domain,domain);
  }

  /** Calculate a hash function. A null field contributes 0. */
  public int hashCode()
  {
    return textHash(userName) + textHash(password) + textHash(domain);
  }

  /** Null-tolerant string equality. */
  private static boolean sameText(String a, String b)
  {
    return (a == null) ? (b == null) : a.equals(b);
  }

  /** Hash of a possibly-null string; null hashes to 0. */
  private static int textHash(String s)
  {
    return (s == null) ? 0 : s.hashCode();
  }
}
}