/* $Id: FileConnector.java 995085 2010-09-08 15:13:38Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.crawler.connectors.filesystem;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
import org.apache.manifoldcf.connectorcommon.extmimemap.ExtensionMimeMap;
import java.util.*;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
/** This is the "repository connector" for a file system. It's a relative of the share crawler, and should have
* comparable basic functionality, with the exception of the ability to use ActiveDirectory and look at other shares.
*/
public class FileConnector extends org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
{
/** Source-control identification string for this file. */
public static final String _rcsid = "@(#)$Id: FileConnector.java 995085 2010-09-08 15:13:38Z kwright $";
// Activities that we know about
/** Activity type recorded through recordActivity() when a directory is listed or a file is read for ingestion. */
protected final static String ACTIVITY_READ = "read document";
// Relationships we know about
/** Relationship type used when a directory entry is queued as a reference from its parent directory. */
protected static final String RELATIONSHIP_CHILD = "child";
// Activities list
/** The complete set of activity types this connector reports; see getActivitiesList(). */
protected static final String[] activitiesList = new String[]{ACTIVITY_READ};
// Parameters that this connector cares about
// (none are active; the root-directory parameter below is disabled)
// public final static String ROOTDIRECTORY = "rootdirectory";
// Local data
// (none are active; the root-directory field below is disabled)
// protected File rootDirectory = null;
/** Default constructor.
 * Performs no initialization beyond that of the base class.
 */
public FileConnector()
{
  super();
}
/** Report the connector model, which tells the framework how to interpret the seeds
 * queued by addSeedDocuments().
 *@return the model type value; this connector always answers MODEL_CHAINED_ADD_CHANGE.
 */
@Override
public int getConnectorModel()
{
  final int model = MODEL_CHAINED_ADD_CHANGE;
  return model;
}
/** Enumerate the relationship types this connector uses when queueing document references.
 *@return a freshly allocated one-element array holding RELATIONSHIP_CHILD.
 */
@Override
public String[] getRelationshipTypes()
{
  final String[] relationshipTypes = new String[1];
  relationshipTypes[0] = RELATIONSHIP_CHILD;
  return relationshipTypes;
}
/** List the activities we might report on.
 *@return a copy of the activity type names this connector records.  A copy is handed out
 * so that a caller modifying the returned array cannot alter the connector's shared list.
 */
@Override
public String[] getActivitiesList()
{
  return activitiesList.clone();
}
/** Name the bins that a given document belongs to.
 *@param documentIdentifier is the document identifier (the document's path); currently unused.
 *@return a one-element array holding the empty string, i.e. every document shares one bin.
 */
@Override
public String[] getBinNames(String documentIdentifier)
{
  /*
  // Disabled testing aid, kept for reference.  It lets a tester observe how documents behave
  // when they fall into a "SLOW" bin, a "FAST" bin, or both at once, by picking the bins from
  // markers in the document's path.  Other connector implementers should NOT copy this approach.
  if (documentIdentifier.indexOf("/BOTH/") != -1 || (documentIdentifier.indexOf("/SLOW/") != -1 && documentIdentifier.indexOf("/FAST/") != -1))
    return new String[]{"SLOW","FAST"};
  if (documentIdentifier.indexOf("/SLOW/") != -1)
    return new String[]{"SLOW"};
  if (documentIdentifier.indexOf("/FAST/") != -1)
    return new String[]{"FAST"};
  */
  final String[] bins = new String[1];
  bins[0] = "";
  return bins;
}
/** Convert a WGET-style relative path to a URI.  The URI is the URI that will be the unique key from
 * the search index, and will be presented to the user as part of the search results.
 * The path is split on '/' into at most three parts: the first is used as the scheme, the second as
 * the host, and the remainder as the URI path.  For example, "http/example.com/a/b.html" yields
 * "http://example.com/a/b.html".  A path with no second part gets the host "localhost", and a path
 * with no third part gets the URI path "/".
 *@param path is the '/'-separated path to convert.
 *@return the document uri.
 *@throws ManifoldCFException if the assembled string is not a legal URI or URL.
 */
protected static String convertToWGETURI(String path)
  throws ManifoldCFException
{
  //
  // Note well: This MUST be a legal URI!!!
  try
  {
    String[] parts = path.split("/", 3);
    // split() with a positive limit always returns at least one element, so the "http"
    // fallback is purely defensive; an empty first segment is passed through as-is and
    // is rejected by the URI parser below.
    String scheme = (parts.length >= 1) ? parts[0] : "http";
    String host = (parts.length >= 2) ? parts[1] : "localhost";
    String other = (parts.length >= 3) ? "/" + parts[2] : "/";
    return new URI(scheme + "://" + host + other).toURL().toString();
  }
  catch (java.net.MalformedURLException e)
  {
    throw new ManifoldCFException("Bad url: "+e.getMessage(),e);
  }
  catch (URISyntaxException e)
  {
    throw new ManifoldCFException("Bad url: "+e.getMessage(),e);
  }
}
/** Turn a document identifier (a local file path) into a file URI.  The result is the unique key
 * used by the search index and is what users see in search results.
 *@param documentIdentifier is the document identifier.
 *@return the document uri.
 *@throws ManifoldCFException if the path cannot be expressed as a URL.
 */
protected static String convertToURI(String documentIdentifier)
  throws ManifoldCFException
{
  // Note well: the result MUST be a legal URI!!!
  final File documentFile = new File(documentIdentifier);
  try
  {
    final java.net.URL documentUrl = documentFile.toURI().toURL();
    return documentUrl.toString();
  }
  catch (java.io.IOException e)
  {
    throw new ManifoldCFException("Bad url",e);
  }
}
/** Queue "seed" documents, i.e. the starting places for crawling activity.
 *
 * Every "startpoint" node of the specification contributes one seed: the canonical form of the
 * node's "path" attribute, handed to the framework through the ISeedingActivity object.
 *
 * This implementation ignores the seeding version, the seed time and the job mode: all start
 * points are re-queued on every call, which is always acceptable because sending MORE seeds
 * than strictly necessary only costs extra work.
 *
 * The connector will be connected before this method can be called.
 *@param activities is the interface this method should use to perform whatever framework actions are desired.
 *@param spec is a document specification (that comes from the job).
 *@param lastSeedVersion is the last seeding version string for this job, or null if the job has no previous seeding version string.
 *@param seedTime is the end of the time range of documents to consider, exclusive.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@return an updated seeding version string, to be stored with the job; always the empty string here.
 *@throws ManifoldCFException if a start point path cannot be converted to canonical form.
 */
@Override
public String addSeedDocuments(ISeedingActivity activities, Specification spec,
  String lastSeedVersion, long seedTime, int jobMode)
  throws ManifoldCFException, ServiceInterruption
{
  try
  {
    // Every root in the specification is presumed to be a "startpoint" node; anything else is skipped.
    int index = 0;
    while (index < spec.getChildCount())
    {
      final SpecificationNode node = spec.getChild(index++);
      if (!node.getType().equals("startpoint"))
        continue;
      // The id handed to the framework MUST be in canonical form!!!
      final File startPoint = new File(node.getAttributeValue("path"));
      final String seed = startPoint.getCanonicalPath();
      if (Logging.connectors.isDebugEnabled())
        Logging.connectors.debug("Seed = '"+seed+"'");
      activities.addSeedDocument(seed);
    }
  }
  catch (IOException e)
  {
    throw new ManifoldCFException("Could not get a canonical path: "+e.getMessage(),e);
  }
  return "";
}
/** Process a set of documents.
 * This is the method that should cause each document to be fetched, processed, and the results either added
 * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
 * The document specification allows this class to filter what is done based on the job.
 * The connector will be connected before this method can be called.
 *
 * Per identifier, this implementation does the following:
 * - a path that no longer exists is deleted from the job;
 * - a directory is never versioned; each child accepted by checkInclude() is queued as a
 *   RELATIONSHIP_CHILD reference under its canonical path;
 * - a regular file gets a version string made of the convert-path marker ("+" followed by the packed
 *   path, or "-"), the last-modified time, ":" and the length.  If that version needs reindexing, the
 *   file passes checkIngest(), and the output connector accepts its length, URL, date and mime type,
 *   the file content is ingested; otherwise the document is recorded as "no document".
 *@param documentIdentifiers is the set of document identifiers to process.
 *@param statuses are the currently-stored document versions for each document in the set of document identifiers
 * passed in above.
 *@param spec is the document specification (that comes from the job).
 *@param activities is the interface this method should use to queue up new document references
 * and ingest documents.
 *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
 *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
 */
@Override
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
  IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
  throws ManifoldCFException, ServiceInterruption
{
  for (String documentIdentifier : documentIdentifiers)
  {
    File file = new File(documentIdentifier);
    if (!file.exists())
    {
      activities.deleteDocument(documentIdentifier);
      continue;
    }
    if (file.isDirectory())
    {
      // It's a directory.  The version ID would be the
      // last modified date, except that doesn't work on Windows
      // because modified dates are not transitive.
      // No versioning; just reference children.
      // Chained connectors scan parent nodes always.
      // Queue up stuff for directory
      long startTime = System.currentTimeMillis();
      String errorCode = null;
      String errorDesc = null;
      try
      {
        try
        {
          File[] files = file.listFiles();
          if (files != null)
          {
            for (File f : files)
            {
              String canonicalPath = f.getCanonicalPath();
              if (checkInclude(f,canonicalPath,spec))
                activities.addDocumentReference(canonicalPath,documentIdentifier,RELATIONSHIP_CHILD);
            }
          }
          errorCode = "OK";
        }
        catch (IOException e)
        {
          errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
          errorDesc = e.getMessage();
          throw new ManifoldCFException("IO exception: "+e.getMessage(),e);
        }
      }
      finally
      {
        // Record the listing attempt whether it succeeded or failed
        if (errorCode != null)
          activities.recordActivity(Long.valueOf(startTime),ACTIVITY_READ,null,documentIdentifier,errorCode,errorDesc,null);
      }
      continue;
    }
    // It's a file
    String versionString;
    String convertPath;
    long fileLength = file.length();
    // Get the file's modified date.  This single reading is used for both the version string and
    // the document's modified date, so that the two can never disagree.
    long lastModified = file.lastModified();
    // Check if the path is to be converted.  We record that info in the version string so that we'll reindex documents whose
    // URI's change.
    convertPath = findConvertPath(spec, file);
    StringBuilder sb = new StringBuilder();
    if (convertPath != null)
    {
      // Record the path.
      sb.append("+");
      pack(sb,convertPath,'+');
    }
    else
      sb.append("-");
    sb.append(lastModified).append(":").append(fileLength);
    versionString = sb.toString();
    if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
      continue;
    long startTime = System.currentTimeMillis();
    String errorCode = null;
    String errorDesc = null;
    Long fileLengthLong = null;
    try
    {
      // We've already avoided queuing documents that we don't want, based on file specifications.
      // We still need to check based on file data.
      if (!checkIngest(file,spec))
      {
        activities.noDocument(documentIdentifier,versionString);
        continue;
      }
      String fileName = file.getName();
      Date modifiedDate = new Date(lastModified);
      String mimeType = mapExtensionToMimeType(fileName);
      String uri;
      if (convertPath != null) {
        // WGET-compatible input; convert back to external URI
        uri = convertToWGETURI(convertPath);
      } else {
        uri = convertToURI(documentIdentifier);
      }
      // Each rejection below sets errorCode before "continue", so the finally block records it
      if (!activities.checkLengthIndexable(fileLength))
      {
        errorCode = IProcessActivity.EXCLUDED_LENGTH;
        errorDesc = "Excluded because of length ("+fileLength+")";
        Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because length was excluded by output connector.");
        activities.noDocument(documentIdentifier,versionString);
        continue;
      }
      if (!activities.checkURLIndexable(uri))
      {
        errorCode = IProcessActivity.EXCLUDED_URL;
        errorDesc = "Excluded because of URL ('"+uri+"')";
        Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because URL was excluded by output connector.");
        activities.noDocument(documentIdentifier,versionString);
        continue;
      }
      if (!activities.checkDateIndexable(modifiedDate))
      {
        errorCode = IProcessActivity.EXCLUDED_DATE;
        errorDesc = "Excluded because of date ("+modifiedDate+")";
        Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because date ("+modifiedDate+") was excluded by output connector.");
        activities.noDocument(documentIdentifier,versionString);
        continue;
      }
      if (!activities.checkMimeTypeIndexable(mimeType))
      {
        errorCode = IProcessActivity.EXCLUDED_MIMETYPE;
        errorDesc = "Excluded because mime type ('"+mimeType+"')";
        Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because mime type ('"+mimeType+"') was excluded by output connector.");
        activities.noDocument(documentIdentifier,versionString);
        continue;
      }
      RepositoryDocument data = new RepositoryDocument();
      data.setFileName(fileName);
      data.setMimeType(mimeType);
      data.setModifiedDate(modifiedDate);
      if (convertPath != null) {
        // WGET-compatible input; convert back to external URI
        data.addField("uri",uri);
      } else {
        data.addField("uri",file.toString());
      }
      // MHL for other metadata
      // Ingest the document.
      try
      {
        InputStream is = new FileInputStream(file);
        try
        {
          data.setBinary(is,fileLength);
          activities.ingestDocumentWithException(documentIdentifier,versionString,uri,data);
          errorCode = "OK";
          fileLengthLong = Long.valueOf(fileLength);
        }
        finally
        {
          is.close();
        }
      }
      catch (FileNotFoundException e)
      {
        // The file could not be opened for reading.  Skip it; throw nothing.
        Logging.connectors.debug("Skipping file due to " +e.getMessage());
        activities.noDocument(documentIdentifier,versionString);
        continue;
      }
      catch (InterruptedIOException e)
      {
        throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
      }
      catch (IOException e)
      {
        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        errorDesc = e.getMessage();
        throw new ManifoldCFException("IO Error: "+e.getMessage(),e);
      }
    }
    catch (ManifoldCFException e)
    {
      // An interruption is not a result worth recording; clear the code so the finally block stays silent
      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
        errorCode = null;
      throw e;
    }
    finally
    {
      if (errorCode != null)
        activities.recordActivity(Long.valueOf(startTime),ACTIVITY_READ,fileLengthLong,documentIdentifier,errorCode,errorDesc,null);
    }
  }
}
/** This method finds the part of the path that should be converted to a URI.
 * Returns null if the path should not be converted.
 * The file's absolute path is compared, with backslashes normalized to '/', against every
 * "startpoint" node whose "converttouri" attribute is "true"; the first start point whose path
 * (with a trailing '/' appended if missing) is a prefix of the file's path wins.
 * Start points with a missing or empty "path" attribute are ignored.
 *@param spec is the document specification.
 *@param theFile is the file whose path is being examined.
 *@return the part of the path after the matching start point, or null if no start point matches.
 */
protected static String findConvertPath(Specification spec, File theFile)
{
  // Literal character replacement; no regular expression is needed to normalize separators
  String fullpath = theFile.getAbsolutePath().replace('\\','/');
  for (int j = 0; j < spec.getChildCount(); j++)
  {
    SpecificationNode sn = spec.getChild(j);
    if (!sn.getType().equals("startpoint"))
      continue;
    String rawPath = sn.getAttributeValue("path");
    // Skip start points that have no usable path or that did not ask for URI conversion
    if (rawPath == null || rawPath.length() == 0 || !"true".equals(sn.getAttributeValue("converttouri")))
      continue;
    String path = rawPath.replace('\\','/');
    if (!path.endsWith("/"))
      path += "/";
    if (fullpath.startsWith(path))
      return fullpath.substring(path.length());
  }
  return null;
}
/** Look up the mime type for a file name, based on its extension.
 *@param fileName is the file name; anything up to and including the last '/' is ignored.
 *@return whatever ExtensionMimeMap reports for the lower-cased text after the last '.',
 * or null if the name contains no '.' at all.
 */
protected static String mapExtensionToMimeType(String fileName)
{
  // Drop any leading directory part; when there is no '/', this keeps the whole name
  final String baseName = fileName.substring(fileName.lastIndexOf("/") + 1);
  final int dotPosition = baseName.lastIndexOf(".");
  if (dotPosition == -1)
  {
    return null;
  }
  final String extension = baseName.substring(dotPosition + 1).toLowerCase(java.util.Locale.ROOT);
  return ExtensionMimeMap.mapToMimeType(extension);
}
// UI support methods.
//
// These support methods come in two varieties. The first bunch is involved in setting up connection configuration information. The second bunch
// is involved in presenting and editing document specification information for a job. The two kinds of methods are accordingly treated differently,
// in that the first bunch cannot assume that the current connector object is connected, while the second bunch can. That is why the first bunch
// receives a thread context argument for all UI methods, while the second bunch does not need one (since it has already been applied via the connect()
// method, above).
/** Output the specification header section.
 * Called in the head section of a job page that has selected a repository connection of this
 * type.  It registers the connector's "Paths" tab and emits the javascript helper used by the
 * job editing HTML: a function, named with this connection's sequence prefix, that stores an
 * operation value in a named field of the "editjob" form and then posts the form to an anchor.
 * The connector will be connected before this method can be called.
 *@param out is the output to which any HTML should be sent.
 *@param locale is the locale the output is preferred to be in.
 *@param ds is the current document specification for this job.
 *@param connectionSequenceNumber is the unique number of this connection within the job.
 *@param tabsArray is an array of tab names.  Add to this array any tab names that are specific to the connector.
 */
@Override
public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification ds,
  int connectionSequenceNumber, List<String> tabsArray)
  throws ManifoldCFException, IOException
{
  final String pathsTabName = Messages.getString(locale,"FileConnector.Paths");
  tabsArray.add(pathsTabName);

  final String seqPrefix = "s"+connectionSequenceNumber+"_";
  final StringBuilder script = new StringBuilder();
  script.append("<script type=\"text/javascript\">\n");
  script.append("<!--\n");
  script.append("\n");
  script.append("function ").append(seqPrefix).append("SpecOp(n, opValue, anchorvalue)\n");
  script.append("{\n");
  script.append(" eval(\"editjob.\"+n+\".value = \\\"\"+opValue+\"\\\"\");\n");
  script.append(" postFormSetAnchor(anchorvalue);\n");
  script.append("}\n");
  script.append("//-->\n");
  script.append("</script>\n");
  out.print(script.toString());
}
/** Output the specification body section.
 * This method is called in the body section of a job page which has selected a repository connection of the
 * current type.  Its purpose is to present the required form elements for editing.
 * The coder can presume that the HTML that is output from this configuration will be within appropriate
 * {@code <html>}, {@code <body>}, and {@code <form>} tags.  The name of the form is always "editjob".
 * The connector will be connected before this method can be called.
 *
 * When the "Paths" tab of this connection is the one being displayed, a table is emitted with one
 * row per "startpoint" node (path, convert-to-URI flag, and a nested table of its include/exclude
 * rules) plus the controls for adding paths and rules.  For any other tab, the same specification
 * data is emitted as hidden form fields only, so that it survives the post.
 *@param out is the output to which any HTML should be sent.
 *@param locale is the locale the output is preferred to be in.
 *@param ds is the current document specification for this job.
 *@param connectionSequenceNumber is the unique number of this connection within the job.
 *@param actualSequenceNumber is the connection within the job that has currently been selected.
 *@param tabName is the current tab name.  (actualSequenceNumber, tabName) form a unique tuple within
 *       the job.
 */
@Override
public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification ds,
  int connectionSequenceNumber, int actualSequenceNumber, String tabName)
  throws ManifoldCFException, IOException
{
  String seqPrefix = "s"+connectionSequenceNumber+"_";
  // i walks all children of the specification; k counts only the "startpoint" children
  int i;
  int k;
  // Paths tab
  if (tabName.equals(Messages.getString(locale,"FileConnector.Paths")) && connectionSequenceNumber == actualSequenceNumber)
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"3\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"FileConnector.Paths2") + "</nobr></td>\n"+
" <td class=\"boxcell\">\n"+
" <table class=\"formtable\">\n"+
" <tr class=\"formheaderrow\">\n"+
" <td class=\"formcolumnheader\"></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.RootPath") + "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.ConvertToURI") + "<br/>" + Messages.getBodyString(locale,"FileConnector.ConvertToURIExample")+ "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.Rules") + "</nobr></td>\n"+
" </tr>\n"
    );
    i = 0;
    k = 0;
    while (i < ds.getChildCount())
    {
      SpecificationNode sn = ds.getChild(i++);
      if (sn.getType().equals("startpoint"))
      {
        String pathDescription = "_"+Integer.toString(k);
        String pathOpName = seqPrefix+"specop"+pathDescription;
        String path = sn.getAttributeValue("path");
        String convertToURIString = sn.getAttributeValue("converttouri");
        boolean convertToURI = false;
        if (convertToURIString != null && convertToURIString.equals("true"))
          convertToURI = true;
        out.print(
" <tr class=\""+(((k % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+
" <td class=\"formcolumncell\">\n"+
" <input type=\"hidden\" name=\""+pathOpName+"\" value=\"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"specpath"+pathDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(path)+"\"/>\n"+
" <a name=\""+seqPrefix+"path_"+Integer.toString(k)+"\">\n"+
" <input type=\"button\" value=\"" + Messages.getAttributeString(locale,"FileConnector.Delete") + "\" onClick='Javascript:"+seqPrefix+"SpecOp(\""+pathOpName+"\",\"Delete\",\""+seqPrefix+"path_"+Integer.toString(k)+"\")' alt=\""+Messages.getAttributeString(locale,"FileConnector.DeletePath")+Integer.toString(k)+"\"/>\n"+
" </a>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(path)+" \n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"converttouri"+pathDescription+"\" value=\""+(convertToURI?"true":"false")+"\"/>\n"+
" <nobr>\n"+
" "+(convertToURI?Messages.getBodyString(locale,"FileConnector.Yes"):Messages.getBodyString(locale,"FileConnector.No"))+" \n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"boxcell\">\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"specchildcount"+pathDescription+"\" value=\""+Integer.toString(sn.getChildCount())+"\"/>\n"+
" <table class=\"formtable\">\n"+
" <tr class=\"formheaderrow\">\n"+
" <td class=\"formcolumnheader\"></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.IncludeExclude") + "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.FileDirectory") + "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.Match") + "</nobr></td>\n"+
" </tr>\n"
        );
        int j = 0;
        while (j < sn.getChildCount())
        {
          SpecificationNode excludeNode = sn.getChild(j);
          String instanceDescription = "_"+Integer.toString(k)+"_"+Integer.toString(j);
          String instanceOpName = seqPrefix + "specop" + instanceDescription;
          String nodeFlavor = excludeNode.getType();
          String nodeType = excludeNode.getAttributeValue("type");
          String nodeMatch = excludeNode.getAttributeValue("match");
          // Flavor and type are specification data just like the match string, so they are escaped
          // before being written into HTML.  String.valueOf() maps a missing value to the literal
          // "null" that plain concatenation used to produce, so the encoder never receives null.
          String flavorAttribute = org.apache.manifoldcf.ui.util.Encoder.attributeEscape(String.valueOf(nodeFlavor));
          String typeAttribute = org.apache.manifoldcf.ui.util.Encoder.attributeEscape(String.valueOf(nodeType));
          String flavorBody = org.apache.manifoldcf.ui.util.Encoder.bodyEscape(String.valueOf(nodeFlavor));
          String typeBody = org.apache.manifoldcf.ui.util.Encoder.bodyEscape(String.valueOf(nodeType));
          out.print(
" <tr class=\"evenformrow\">\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <input type=\"button\" value=\"" + Messages.getAttributeString(locale,"FileConnector.InsertHere") + "\" onClick='Javascript:"+seqPrefix+"SpecOp(\""+instanceOpName+"\",\"Insert Here\",\""+seqPrefix+"match_"+Integer.toString(k)+"_"+Integer.toString(j+1)+"\")' alt=\""+Messages.getAttributeString(locale,"FileConnector.InsertNewMatchForPath")+Integer.toString(k)+" before position #"+Integer.toString(j)+"\"/>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <select name=\""+seqPrefix+"specflavor"+instanceDescription+"\">\n"+
" <option value=\"include\">" + Messages.getBodyString(locale,"FileConnector.include") + "</option>\n"+
" <option value=\"exclude\">" + Messages.getBodyString(locale,"FileConnector.exclude") + "</option>\n"+
" </select>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <select name=\""+seqPrefix+"spectype"+instanceDescription+"\">\n"+
" <option value=\"file\">" + Messages.getBodyString(locale,"FileConnector.File") + "</option>\n"+
" <option value=\"directory\">" + Messages.getBodyString(locale,"FileConnector.Directory") + "</option>\n"+
" </select>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <input type=\"text\" size=\"10\" name=\""+seqPrefix+"specmatch"+instanceDescription+"\" value=\"\"/>\n"+
" </nobr>\n"+
" </td>\n"+
" </tr>\n"+
" <tr class=\"oddformrow\">\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <input type=\"hidden\" name=\""+instanceOpName+"\" value=\"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"specfl"+instanceDescription+"\" value=\""+flavorAttribute+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"specty"+instanceDescription+"\" value=\""+typeAttribute+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"specma"+instanceDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nodeMatch)+"\"/>\n"+
" <a name=\""+seqPrefix+"match_"+Integer.toString(k)+"_"+Integer.toString(j)+"\">\n"+
" <input type=\"button\" value=\"" + Messages.getAttributeString(locale,"FileConnector.Delete") + "\" onClick='Javascript:"+seqPrefix+"SpecOp(\""+instanceOpName+"\",\"Delete\",\""+seqPrefix+"match_"+Integer.toString(k)+"_"+Integer.toString(j)+"\")' alt=\""+Messages.getAttributeString(locale,"FileConnector.DeletePath")+Integer.toString(k)+", match spec #"+Integer.toString(j)+"\"/>\n"+
" </a>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+flavorBody+"\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+typeBody+"\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nodeMatch)+"\n"+
" </nobr>\n"+
" </td>\n"+
" </tr>\n"
          );
          j++;
        }
        if (j == 0)
        {
          out.print(
" <tr class=\"formrow\"><td class=\"formcolumnmessage\" colspan=\"4\">" + Messages.getBodyString(locale,"FileConnector.NoRulesDefined") + "</td></tr>\n"
          );
        }
        // Row for appending a new rule to this path; j now equals the number of existing rules
        out.print(
" <tr class=\"formrow\"><td class=\"lightseparator\" colspan=\"4\"><hr/></td></tr>\n"+
" <tr class=\"formrow\">\n"+
" <td class=\"formcolumncell\">\n"+
" <a name=\""+seqPrefix+"match_"+Integer.toString(k)+"_"+Integer.toString(j)+"\">\n"+
" <input type=\"button\" value=\"" + Messages.getAttributeString(locale,"FileConnector.Add") + "\" onClick='Javascript:"+seqPrefix+"SpecOp(\""+pathOpName+"\",\"Add\",\""+seqPrefix+"match_"+Integer.toString(k)+"_"+Integer.toString(j+1)+"\")' alt=\""+Messages.getAttributeString(locale,"FileConnector.AddNewMatchForPath")+Integer.toString(k)+"\"/>\n"+
" </a>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <select name=\""+seqPrefix+"specflavor"+pathDescription+"\">\n"+
" <option value=\"include\">" + Messages.getBodyString(locale,"FileConnector.include") + "</option>\n"+
" <option value=\"exclude\">" + Messages.getBodyString(locale,"FileConnector.exclude") + "</option>\n"+
" </select>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <select name=\""+seqPrefix+"spectype"+pathDescription+"\">\n"+
" <option value=\"file\">" + Messages.getBodyString(locale,"FileConnector.File") + "</option>\n"+
" <option value=\"directory\">" + Messages.getBodyString(locale,"FileConnector.Directory") + "</option>\n"+
" </select>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <input type=\"text\" size=\"10\" name=\""+seqPrefix+"specmatch"+pathDescription+"\" value=\"\"/>\n"+
" </nobr>\n"+
" </td>\n"+
" </tr>\n"+
" </table>\n"+
" </td>\n"+
" </tr>\n"
        );
        k++;
      }
    }
    if (k == 0)
    {
      out.print(
" <tr class=\"formrow\"><td class=\"formcolumnmessage\" colspan=\"4\">" + Messages.getBodyString(locale,"FileConnector.NoDocumentsSpecified") + "</td></tr>\n"
      );
    }
    // Row for appending a new path.  The target anchor is derived from the path counter k, the
    // same counter that names the path_N anchors above, mirroring the match_k_(j+1) convention
    // of the rule "Add" button; the child index i would diverge from k if the specification
    // ever held a non-startpoint child.
    out.print(
" <tr class=\"formrow\"><td class=\"lightseparator\" colspan=\"4\"><hr/></td></tr>\n"+
" <tr class=\"formrow\">\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <a name=\""+seqPrefix+"path_"+Integer.toString(k)+"\">\n"+
" <input type=\"button\" value=\"" + Messages.getAttributeString(locale,"FileConnector.Add") + "\" onClick='Javascript:"+seqPrefix+"SpecOp(\""+seqPrefix+"specop\",\"Add\",\""+seqPrefix+"path_"+Integer.toString(k+1)+"\")' alt=\"" + Messages.getAttributeString(locale,"FileConnector.AddNewPath") + "\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"pathcount\" value=\""+Integer.toString(k)+"\"/>\n"+
" <input type=\"hidden\" name=\""+seqPrefix+"specop\" value=\"\"/>\n"+
" </a>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <input type=\"text\" size=\"30\" name=\""+seqPrefix+"specpath\" value=\"\"/>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" <input name=\""+seqPrefix+"converttouri\" type=\"checkbox\" value=\"true\"/>\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" </td>\n"+
" </tr>\n"+
" </table>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    // Some other tab is showing: carry the whole specification along as hidden fields
    i = 0;
    k = 0;
    while (i < ds.getChildCount())
    {
      SpecificationNode sn = ds.getChild(i++);
      if (sn.getType().equals("startpoint"))
      {
        String pathDescription = "_"+Integer.toString(k);
        String path = sn.getAttributeValue("path");
        String convertToURIString = sn.getAttributeValue("converttouri");
        boolean convertToURI = false;
        if (convertToURIString != null && convertToURIString.equals("true"))
          convertToURI = true;
        out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"specpath"+pathDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(path)+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"converttouri"+pathDescription+"\" value=\""+(convertToURI?"true":"false")+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"specchildcount"+pathDescription+"\" value=\""+Integer.toString(sn.getChildCount())+"\"/>\n"
        );
        int j = 0;
        while (j < sn.getChildCount())
        {
          SpecificationNode excludeNode = sn.getChild(j);
          String instanceDescription = "_"+Integer.toString(k)+"_"+Integer.toString(j);
          String nodeFlavor = excludeNode.getType();
          String nodeType = excludeNode.getAttributeValue("type");
          String nodeMatch = excludeNode.getAttributeValue("match");
          // Escaped for the same reason, and with the same "null" handling, as on the Paths tab
          String flavorAttribute = org.apache.manifoldcf.ui.util.Encoder.attributeEscape(String.valueOf(nodeFlavor));
          String typeAttribute = org.apache.manifoldcf.ui.util.Encoder.attributeEscape(String.valueOf(nodeType));
          out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"specfl"+instanceDescription+"\" value=\""+flavorAttribute+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"specty"+instanceDescription+"\" value=\""+typeAttribute+"\"/>\n"+
"<input type=\"hidden\" name=\""+seqPrefix+"specma"+instanceDescription+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(nodeMatch)+"\"/>\n"
          );
          j++;
        }
        k++;
      }
    }
    out.print(
"<input type=\"hidden\" name=\""+seqPrefix+"pathcount\" value=\""+Integer.toString(k)+"\"/>\n"
    );
  }
}
/** Process a specification post.
* Called at the start of a job's edit or view page whenever form data for a connection may have been
* posted. It reads the posted form fields and rebuilds the document specification from them. The name
* of the posted form is always "editjob".
* The connector will be connected before this method can be called.
*
* If the "pathcount" field for this connection is absent, the specification is left untouched. Otherwise
* all existing children are removed and one "startpoint" node is rebuilt per posted path, honoring the
* per-path "Delete"/"Add" operations and the per-rule "Delete"/"Insert Here" operations. A global "Add"
* operation appends a new start point with default "include all files" and "include all directories" rules.
*@param variableContext contains the post data, including binary file-upload information.
*@param locale is the locale the output is preferred to be in.
*@param ds is the current document specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@return null if all is well, or a string error message if there is an error that should prevent saving of
* the job (and cause a redirection to an error page). This implementation always returns null.
*/
@Override
public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification ds,
  int connectionSequenceNumber)
  throws ManifoldCFException
{
  final String seqPrefix = "s"+connectionSequenceNumber+"_";

  final String postedPathCount = variableContext.getParameter(seqPrefix+"pathcount");
  if (postedPathCount == null)
  {
    // Nothing was posted for this connection; leave the specification alone.
    return null;
  }

  // The posted form is authoritative: rebuild the specification from scratch.
  ds.clearChildren();
  final int pathCount = Integer.parseInt(postedPathCount);

  // Index at which the next surviving start point is placed in the specification
  int nextStartPointIndex = 0;
  for (int pathIndex = 0; pathIndex < pathCount; pathIndex++)
  {
    final String pathSuffix = "_"+Integer.toString(pathIndex);
    final String pathOp = variableContext.getParameter(seqPrefix+"specop"+pathSuffix);
    if ("Delete".equals(pathOp))
    {
      // A deleted path is simply not copied into the rebuilt specification
      continue;
    }

    // Path inserts won't happen until the very end
    final String path = variableContext.getParameter(seqPrefix+"specpath"+pathSuffix);
    final String convertToURI = variableContext.getParameter(seqPrefix+"converttouri"+pathSuffix);

    final SpecificationNode startPoint = new SpecificationNode("startpoint");
    startPoint.setAttribute("path",path);
    if (convertToURI != null)
      startPoint.setAttribute("converttouri",convertToURI);

    // Walk the rules that were posted for this path
    final int childCount = Integer.parseInt(variableContext.getParameter(seqPrefix+"specchildcount"+pathSuffix));
    // Index at which the next surviving rule is placed under this start point
    int nextRuleIndex = 0;
    for (int ruleIndex = 0; ruleIndex < childCount; ruleIndex++)
    {
      final String ruleSuffix = pathSuffix+"_"+Integer.toString(ruleIndex);
      final String ruleOp = variableContext.getParameter(seqPrefix+"specop"+ruleSuffix);

      if ("Delete".equals(ruleOp))
      {
        // Deleted rules are dropped as we gather
        continue;
      }

      if ("Insert Here".equals(ruleOp))
      {
        // The newly entered rule goes in front of the existing rule at this position
        final String insertedFlavor = variableContext.getParameter(seqPrefix+"specflavor"+ruleSuffix);
        final String insertedType = variableContext.getParameter(seqPrefix+"spectype"+ruleSuffix);
        final String insertedMatch = variableContext.getParameter(seqPrefix+"specmatch"+ruleSuffix);
        final SpecificationNode insertedRule = new SpecificationNode(insertedFlavor);
        insertedRule.setAttribute("type",insertedType);
        insertedRule.setAttribute("match",insertedMatch);
        startPoint.addChild(nextRuleIndex++,insertedRule);
      }

      // Carry over the existing rule from its hidden form fields
      final String existingFlavor = variableContext.getParameter(seqPrefix+"specfl"+ruleSuffix);
      final String existingType = variableContext.getParameter(seqPrefix+"specty"+ruleSuffix);
      final String existingMatch = variableContext.getParameter(seqPrefix+"specma"+ruleSuffix);
      final SpecificationNode existingRule = new SpecificationNode(existingFlavor);
      existingRule.setAttribute("type",existingType);
      existingRule.setAttribute("match",existingMatch);
      startPoint.addChild(nextRuleIndex++,existingRule);
    }

    if ("Add".equals(pathOp))
    {
      // A per-path "Add" appends one more rule after all the gathered ones
      final String addedMatch = variableContext.getParameter(seqPrefix+"specmatch"+pathSuffix);
      final String addedType = variableContext.getParameter(seqPrefix+"spectype"+pathSuffix);
      final String addedFlavor = variableContext.getParameter(seqPrefix+"specflavor"+pathSuffix);
      final SpecificationNode addedRule = new SpecificationNode(addedFlavor);
      addedRule.setAttribute("type",addedType);
      addedRule.setAttribute("match",addedMatch);
      startPoint.addChild(nextRuleIndex,addedRule);
    }

    ds.addChild(nextStartPointIndex++,startPoint);
  }

  // See if there's a global add operation
  final String globalOp = variableContext.getParameter(seqPrefix+"specop");
  if ("Add".equals(globalOp))
  {
    final String newPath = variableContext.getParameter(seqPrefix+"specpath");
    final String newConvertToURI = variableContext.getParameter(seqPrefix+"converttouri");

    final SpecificationNode newStartPoint = new SpecificationNode("startpoint");
    newStartPoint.setAttribute("path",newPath);
    if (newConvertToURI != null)
      newStartPoint.setAttribute("converttouri",newConvertToURI);

    // Now add in the defaults; these will be "include all files" and "include all directories".
    final SpecificationNode includeAllFiles = new SpecificationNode("include");
    includeAllFiles.setAttribute("type","file");
    includeAllFiles.setAttribute("match","*");
    newStartPoint.addChild(newStartPoint.getChildCount(),includeAllFiles);

    final SpecificationNode includeAllDirectories = new SpecificationNode("include");
    includeAllDirectories.setAttribute("type","directory");
    includeAllDirectories.setAttribute("match","*");
    newStartPoint.addChild(newStartPoint.getChildCount(),includeAllDirectories);

    ds.addChild(nextStartPointIndex,newStartPoint);
  }

  return null;
}
/** View specification.
* This method is called in the body section of a job's view page. Its purpose is to present the document
* specification information to the user. The coder can presume that the HTML that is output from
* this configuration will be within appropriate &lt;html&gt; and &lt;body&gt; tags.
* The connector will be connected before this method can be called.
*
* The output is a table with one row per "startpoint" node, showing its path, whether it is converted
* to a URI, and a nested table of its include/exclude rules. Every value taken from the specification
* (path, rule flavor, rule type, rule match) is HTML-escaped before being written, because the
* specification is built from posted form data.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale the output is preferred to be in.
*@param ds is the current document specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*/
@Override
public void viewSpecification(IHTTPOutput out, Locale locale, Specification ds,
  int connectionSequenceNumber)
  throws ManifoldCFException, IOException
{
  out.print(
"<table class=\"displaytable\">\n"+
" <tr>\n"+
" <td class=\"description\">" + Messages.getAttributeString(locale,"FileConnector.Paths2") + "</td>\n"+
" <td class=\"boxcell\">\n"+
" <table class=\"formtable\">\n"+
" <tr class=\"formheaderrow\">\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.RootPath") + "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.ConvertToURI") + "<br/>" + Messages.getBodyString(locale,"FileConnector.ConvertToURIExample")+ "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.Rules") + "</nobr></td>\n"+
" </tr>\n"
  );

  // k counts the start points actually displayed; it drives row striping and the "none" message
  int k = 0;
  for (int i = 0; i < ds.getChildCount(); i++)
  {
    SpecificationNode sn = ds.getChild(i);
    if (sn.getType().equals("startpoint"))
    {
      String path = sn.getAttributeValue("path");
      String convertToURIString = sn.getAttributeValue("converttouri");
      // Only the literal value "true" enables URI conversion; a missing attribute means "no"
      boolean convertToURI = false;
      if (convertToURIString != null && convertToURIString.equals("true"))
        convertToURI = true;
      out.print(
" <tr class=\""+(((k % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(path)+" \n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+(convertToURI?Messages.getBodyString(locale,"FileConnector.Yes"):Messages.getBodyString(locale,"FileConnector.No"))+" \n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"boxcell\">\n"+
" <table class=\"formtable\">\n"+
" <tr class=\"formheaderrow\">\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.IncludeExclude") + "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.FileDirectory") + "</nobr></td>\n"+
" <td class=\"formcolumnheader\"><nobr>" + Messages.getBodyString(locale,"FileConnector.Match") + "</nobr></td>\n"+
" </tr>\n"
      );

      // l counts the rules displayed for this start point
      int l = 0;
      for (int j = 0; j < sn.getChildCount(); j++)
      {
        SpecificationNode excludeNode = sn.getChild(j);
        // Flavor and type originate from posted form fields, so they must be escaped like the
        // path and match values. String.valueOf keeps the previous "null" rendering for a
        // missing attribute instead of failing inside the escaper.
        String nodeFlavor = org.apache.manifoldcf.ui.util.Encoder.bodyEscape(String.valueOf(excludeNode.getType()));
        String nodeType = org.apache.manifoldcf.ui.util.Encoder.bodyEscape(String.valueOf(excludeNode.getAttributeValue("type")));
        String nodeMatch = excludeNode.getAttributeValue("match");
        out.print(
" <tr class=\""+(((l % 2)==0)?"evenformrow":"oddformrow")+"\">\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+nodeFlavor+"\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+nodeType+"\n"+
" </nobr>\n"+
" </td>\n"+
" <td class=\"formcolumncell\">\n"+
" <nobr>\n"+
" "+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(nodeMatch)+"\n"+
" </nobr>\n"+
" </td>\n"+
" </tr>\n"
        );
        l++;
      }

      if (l == 0)
      {
        out.print(
" <tr><td class=\"formcolumnmessage\" colspan=\"3\">" + Messages.getBodyString(locale,"FileConnector.NoRulesDefined") + "</td></tr>\n"
        );
      }

      out.print(
" </table>\n"+
" </td>\n"
      );
      out.print(
" </tr>\n"
      );
      k++;
    }
  }

  if (k == 0)
  {
    out.print(
" <tr><td class=\"formcolumnmessage\" colspan=\"3\">" + Messages.getBodyString(locale,"FileConnector.NoDocumentsSpecified") + "</td></tr>\n"
    );
  }

  out.print(
" </table>\n"+
" </td>\n"+
" </tr>\n"
  );
  out.print(
"</table>\n"
  );
}
// Protected static methods
/** Check if a file or directory should be included, given a document specification.
* Each "startpoint" node whose canonical path is a leading part of the candidate's directory is
* consulted in specification order, and its rules are evaluated in order: the first matching
* "include" rule accepts the candidate and the first matching "exclude" rule rejects it. Rules of
* type "file" are applied only to files (against the bare file name); all other rules are applied only
* to directories (against the part of the path that follows the start point). If no rule matches,
* the candidate is not included.
*@param file is the file or directory being considered.
*@param fileName is the canonical file name.
*@param documentSpecification is the specification.
*@return true if it should be included.
*/
protected static boolean checkInclude(File file, String fileName, Specification documentSpecification)
  throws ManifoldCFException
{
  if (Logging.connectors.isDebugEnabled())
  {
    Logging.connectors.debug("Checking whether to include file '"+fileName+"'");
  }

  try
  {
    // For a directory the whole name is the path and there is no file part;
    // for a file the path is the canonical parent directory.
    final boolean isDirectory = file.isDirectory();
    final String directoryPath = isDirectory ? fileName : file.getParentFile().getCanonicalPath();
    final String leafName = isDirectory ? null : file.getName();

    // Scan until we match a startpoint
    for (int specIndex = 0; specIndex < documentSpecification.getChildCount(); specIndex++)
    {
      final SpecificationNode startPoint = documentSpecification.getChild(specIndex);
      if (!startPoint.getType().equals("startpoint"))
        continue;

      final String rootPath = new File(startPoint.getAttributeValue("path")).getCanonicalPath();
      if (Logging.connectors.isDebugEnabled())
      {
        Logging.connectors.debug("Checking path '"+rootPath+"' against canonical '"+directoryPath+"'");
      }

      // remainderStart is where the rest of the path (after the start point) begins in directoryPath
      final int remainderStart = matchSubPath(rootPath,directoryPath);
      if (remainderStart == -1)
      {
        if (Logging.connectors.isDebugEnabled())
        {
          Logging.connectors.debug("Match check '"+rootPath+"' against canonical '"+directoryPath+"' failed");
        }
        continue;
      }

      // Walk through the rules of this start point and see whether the candidate is in or out.
      for (int ruleIndex = 0; ruleIndex < startPoint.getChildCount(); ruleIndex++)
      {
        final SpecificationNode rule = startPoint.getChild(ruleIndex);
        final String flavor = rule.getType();
        final String match = rule.getAttributeValue("match");
        final String type = rule.getAttributeValue("type");

        // File rules are skipped for directories, and non-file rules are skipped for files.
        final boolean fileRule = type.equals("file");
        if (fileRule == (leafName == null))
          continue;

        // File rules match the bare name from its start; directory rules match the path remainder.
        final String subject = fileRule ? leafName : directoryPath;
        final int subjectStart = fileRule ? 0 : remainderStart;

        // Only "include" and "exclude" flavors decide anything; the first match wins.
        final boolean isInclude = flavor.equals("include");
        if (isInclude || flavor.equals("exclude"))
        {
          if (checkMatch(subject,subjectStart,match))
            return isInclude;
        }
      }
    }

    if (Logging.connectors.isDebugEnabled())
    {
      Logging.connectors.debug("Not including '"+fileName+"' because no matching rules");
    }
    return false;
  }
  catch (IOException e)
  {
    throw new ManifoldCFException("IO Error",e);
  }
}
/** Check if a file should be ingested, given a document specification.
* The existing contract states that documents are checked with this method in relation to
* checkInclude(). NOTE(review): the original wording says documents that do NOT pass checkInclude()
* are checked here, which reads inverted; confirm against the callers.
* No content-based exclusions exist at present, so every file is accepted.
*@param file is the file.
*@param documentSpecification is the specification.
*@return true if the file should be ingested; currently always true.
*/
protected static boolean checkIngest(File file, Specification documentSpecification)
  throws ManifoldCFException
{
  // Placeholder: none of the current exclusions depend on file contents, so there is nothing to check.
  return true;
}
/** Match a sub-path. The sub-path must match the complete starting part of the full path, in a path
* sense: the match must end exactly at the end of the full path or on a path separator boundary, so
* that "/data/docs" matches "/data/docs/sub" but not "/data/docs2". The returned value should point
* into the file name beyond the end of the matched path (skipping the separator that follows it, if
* any), or be -1 if there is no match.
*@param subPath is the sub path.
*@param fullPath is the full path.
*@return the index of the start of the remaining part of the full path, or -1.
*/
protected static int matchSubPath(String subPath, String fullPath)
{
  // startsWith also covers the case where subPath is longer than fullPath
  if (!fullPath.startsWith(subPath))
    return -1;
  int rval = subPath.length();
  // Exact match: nothing remains
  if (fullPath.length() == rval)
    return rval;
  // The sub-path is followed by a separator: the remainder starts just after it
  if (fullPath.charAt(rval) == File.separatorChar)
    return rval + 1;
  // The sub-path itself ends on a separator (for example a filesystem root), or is empty:
  // the boundary is already a path boundary, so the remainder starts right here
  if (rval == 0 || subPath.charAt(rval - 1) == File.separatorChar)
    return rval;
  // Otherwise the sub-path is only a textual prefix of a different name; not a path match
  return -1;
}
/** Check a match between two strings with wildcards.
* The comparison is always case sensitive and starts at the beginning of the wildcard string; the
* actual matching is delegated to processCheck().
*@param sourceMatch is the expanded string (no wildcards)
*@param sourceIndex is the starting point in the expanded string.
*@param match is the wildcard-based string.
*@return true if there is a match.
*/
protected static boolean checkMatch(String sourceMatch, int sourceIndex, String match)
{
  // Note: java regex was judged too heavyweight for this purpose, so a simple hand-written
  // wildcard matcher is used instead. Arguments: case sensitive, pattern position 0.
  return processCheck(true, sourceMatch, sourceIndex, match, 0);
}
/** Worker method for checkMatch. Returns 'true' if there is a way to consume both strings in their
* entirety in a matched way, where '*' in the match string stands for any run of characters
* (including none) and '?' stands for exactly one character.
*
* The match is done iteratively: characters are compared left to right, and when a mismatch occurs
* the most recent '*' is made to absorb one more source character and matching resumes from just
* after that '*'. This keeps the cost proportional to (source length x match length) even for
* match strings with many wildcards, and uses no recursion.
*@param caseSensitive is true if file names are case sensitive. When false, the letters A-Z are
* treated as equal to a-z.
*@param sourceMatch is the source string (w/o wildcards)
*@param sourceIndex is the current point in the source string. A value beyond the end of the source
* string never matches.
*@param match is the match string (w/wildcards)
*@param matchIndex is the current point in the match string.
*@return true if there is a match.
*/
protected static boolean processCheck(boolean caseSensitive, String sourceMatch, int sourceIndex,
  String match, int matchIndex)
{
  final int sourceLength = sourceMatch.length();
  final int matchLength = match.length();

  // A starting point past the end of the source cannot match anything
  if (sourceIndex > sourceLength)
    return false;

  // Position in the match string just after the most recent '*', or -1 if none seen yet
  int resumeMatchIndex = -1;
  // Source position up to which the most recent '*' has been extended so far
  int resumeSourceIndex = -1;

  while (sourceIndex < sourceLength)
  {
    if (matchIndex < matchLength)
    {
      char y = match.charAt(matchIndex);
      if (y == '*')
      {
        // Wildcard! First try letting it match nothing; remember where to come back to.
        matchIndex++;
        resumeMatchIndex = matchIndex;
        resumeSourceIndex = sourceIndex;
        continue;
      }
      char x = sourceMatch.charAt(sourceIndex);
      if (!caseSensitive)
      {
        // Fold upper case ASCII letters to lower case on both sides
        if (x >= 'A' && x <= 'Z')
          x += 'a'-'A';
        if (y >= 'A' && y <= 'Z')
          y += 'a'-'A';
      }
      if (y == '?' || x == y)
      {
        sourceIndex++;
        matchIndex++;
        continue;
      }
    }
    // Mismatch, or the match string ran out while source characters remain.
    // Without an earlier '*' to fall back on there is no match.
    if (resumeMatchIndex == -1)
      return false;
    // Let the most recent '*' swallow one more source character and retry from there
    resumeSourceIndex++;
    sourceIndex = resumeSourceIndex;
    matchIndex = resumeMatchIndex;
  }

  // The source is consumed; whatever is left of the match string must be '*' only
  while (matchIndex < matchLength && match.charAt(matchIndex) == '*')
    matchIndex++;
  return matchIndex == matchLength;
}
}