/* $Id: GTSConnector.java 988245 2010-08-23 18:39:35Z kwright $ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.agents.output.gts;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.Logging;
import org.apache.manifoldcf.connectorcommon.interfaces.*;
// POIFS stuff
import org.apache.poi.poifs.eventfilesystem.POIFSReader;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.filesystem.POIFSDocumentPath;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.PropertySetFactory;
import org.apache.poi.hpsf.PropertySet;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.MarkUnsupportedException;
import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
import java.util.*;
import java.io.*;
/** This is the output connector for the MetaCarta appliance. It establishes a notion of
* collection(s) a document is ingested into, as well as the idea of a document template for the
* output.
*/
public class GTSConnector extends org.apache.manifoldcf.agents.output.BaseOutputConnector
{
/** Version-control identification string for this source file. */
public static final String _rcsid = "@(#)$Id: GTSConnector.java 988245 2010-08-23 18:39:35Z kwright $";
// Activities we log. Both names are reported by getActivitiesList().
/** Ingestion activity */
public final static String INGEST_ACTIVITY = "document ingest";
/** Document removal activity */
public final static String REMOVE_ACTIVITY = "document deletion";
// These are the document types the fingerprinter understands.
// fingerprint() returns one of these codes; checkDocumentIndexable() accepts only
// DT_TEXT, DT_MSWORD, DT_MSEXCEL, DT_PDF and DT_MSPOWERPOINT.
/** The document type could not be determined. */
protected static final int DT_UNKNOWN = -1;
protected static final int DT_COMPOUND_DOC = 0;
protected static final int DT_MSWORD = 1;
protected static final int DT_MSEXCEL = 2;
protected static final int DT_MSPOWERPOINT = 3;
protected static final int DT_MSOUTLOOK = 4;
/** The leading bytes of the file look like text. */
protected static final int DT_TEXT = 5;
/** No bytes could be read from the file (it is empty). */
protected static final int DT_ZERO = 6;
/** The file begins with the "%PDF" signature. */
protected static final int DT_PDF = 7;
/** Local data: the HTTP poster, created on demand by getSession() and cleared by disconnect(). */
protected HttpPoster poster = null;
/** Constructor.
* Does no work of its own; the HTTP poster is created later, on demand, by getSession().
*/
public GTSConnector()
{
}
/** List the names of the activities this connector may write into the activity log.
*@return a freshly allocated array holding the ingestion activity name followed by the
* removal activity name.
*/
@Override
public String[] getActivitiesList()
{
  final String[] activityNames = new String[2];
  activityNames[0] = INGEST_ACTIVITY;
  activityNames[1] = REMOVE_ACTIVITY;
  return activityNames;
}
/** Connect.
* This implementation only delegates to the superclass; no network session is opened here.
* The HTTP poster is built later, on first use, by getSession().
*@param configParameters is the set of configuration parameters, which
* in this case describe the target appliance, basic auth configuration, etc. (This formerly came
* out of the ini file.)
*/
@Override
public void connect(ConfigParams configParameters)
{
super.connect(configParameters);
}
/** Close the connection. Call this before discarding the connection.
* Drops the cached HTTP poster so that the next getSession() call builds a new one,
* then delegates to the superclass.
*@throws ManifoldCFException if the superclass disconnect fails.
*/
@Override
public void disconnect()
throws ManifoldCFException
{
// Forget the poster first so a later getSession() re-reads the configuration.
poster = null;
super.disconnect();
}
/** Set up a session.
* Builds the HTTP poster from the connection parameters the first time it is needed;
* later calls return immediately while the poster is still present.
*@throws ManifoldCFException if the ingest URI parameter is not configured.
*/
protected void getSession()
  throws ManifoldCFException
{
  // Already have a poster: nothing to do.
  if (poster != null)
    return;

  final String uri = params.getParameter(GTSConfig.PARAM_INGESTURI);
  if (uri == null)
  {
    throw new ManifoldCFException("Missing parameter '"+GTSConfig.PARAM_INGESTURI+"'");
  }
  final String user = params.getParameter(GTSConfig.PARAM_USERID);
  final String secret = params.getObfuscatedParameter(GTSConfig.PARAM_PASSWORD);
  final String authRealm = params.getParameter(GTSConfig.PARAM_REALM);
  poster = new HttpPoster(currentContext,authRealm,user,secret,uri);
}
/** Test the connection.
* Establishes a session, asks the poster to perform its check post, and then returns whatever
* the superclass check reports. A ServiceInterruption is reported as a transient error string
* rather than being propagated.
*@return the connection's status as a displayable string.
*@throws ManifoldCFException if the session cannot be set up or the check fails permanently.
*/
@Override
public String check()
  throws ManifoldCFException
{
  String status;
  try
  {
    getSession();
    poster.checkPost();
    status = super.check();
  }
  catch (ServiceInterruption interruption)
  {
    status = "Transient error: "+interruption.getMessage();
  }
  return status;
}
/** Mime type strings this connector treats as ingestable.
* The entries are loaded into ingestableMimeTypeMap by the static initializer, and
* checkMimeTypeIndexable() looks the incoming mime type up in that map as-is. The lookup is
* therefore an exact, case-sensitive string match.
* NOTE(review): the differently-cased duplicates below (e.g. "application/pdf",
* "Application/pdf", "application/PDF") are presumably present because of that exact match - confirm.
*/
protected static final String[] ingestableMimeTypeArray = new String[]
{
"application/excel",
"application/powerpoint",
"application/ppt",
"application/rtf",
"application/xls",
"text/html",
"text/rtf",
"text/pdf",
"application/x-excel",
"application/x-msexcel",
"application/x-mspowerpoint",
"application/x-msword-doc",
"application/x-msword",
"application/x-word",
"Application/pdf",
"text/xml",
"no-type",
"text/plain",
"application/pdf",
"application/x-rtf",
"application/vnd.ms-excel",
"application/vnd.ms-pps",
"application/vnd.ms-powerpoint",
"application/vnd.ms-word",
"application/msword",
"application/msexcel",
"application/mspowerpoint",
"application/ms-powerpoint",
"application/ms-word",
"application/ms-excel",
"Adobe",
"application/Vnd.Ms-Excel",
"vnd.ms-powerpoint",
"application/x-pdf",
"winword",
"text/richtext",
"Text",
"Text/html",
"application/MSWORD",
"application/PDF",
"application/MSEXCEL",
"application/MSPOWERPOINT"
};
/** Lookup table of ingestable mime types; each entry of ingestableMimeTypeArray is stored as both key and value. */
protected static final Map ingestableMimeTypeMap = new HashMap();
static
{
  // Populate the lookup table once, at class-load time.
  for (String mimeType : ingestableMimeTypeArray)
  {
    ingestableMimeTypeMap.put(mimeType,mimeType);
  }
}
/** Decide whether a mime type is indexable by this connector. Repository connectors use this to
* pre-filter documents before handing them to this output connector.
* The check is an exact lookup of the given string in the table of ingestable mime types.
*@param outputDescription is the pipeline description (not consulted by this implementation).
*@param mimeType is the mime type of the document.
*@param activities is the activity handle (not consulted by this implementation).
*@return true if the mime type is indexable by this connector.
*/
@Override
public boolean checkMimeTypeIndexable(VersionContext outputDescription, String mimeType, IOutputCheckActivity activities)
  throws ManifoldCFException, ServiceInterruption
{
  final Object knownType = ingestableMimeTypeMap.get(mimeType);
  return knownType != null;
}
/** Pre-determine whether a document (passed here as a File object) is indexable by this connector.
* Repository connectors use this to avoid transferring documents the output cannot handle.
* The file is fingerprinted, and only text, MS Word, MS Excel, PDF and MS PowerPoint results are accepted.
*@param outputDescription is the pipeline description (not consulted by this implementation).
*@param localFile is the local file to check.
*@param activities is the activity handle (not consulted by this implementation).
*@return true if the file is indexable.
*/
@Override
public boolean checkDocumentIndexable(VersionContext outputDescription, File localFile, IOutputCheckActivity activities)
  throws ManifoldCFException, ServiceInterruption
{
  switch (fingerprint(localFile))
  {
  case DT_TEXT:
  case DT_MSWORD:
  case DT_MSEXCEL:
  case DT_PDF:
  case DT_MSPOWERPOINT:
    return true;
  default:
    // Unknown, empty, Outlook and generic compound documents are all refused.
    return false;
  }
}
/** Build the pipeline version description for an output specification. The version string
* captures the details of the specification and configuration that matter for deciding whether a
* document must be output again. Document content is not considered here; the repository
* connector supplies a separate version string for that.
*
* The version string consists of:
* (1) the collection name(s), in sorted order;
* (2) the document template;
* (3) the ingest URI.
*@param spec is the current output specification for the job that is doing the crawling.
*@return a VersionContext wrapping the version string, the connection parameters and the specification.
*/
@Override
public VersionContext getPipelineDescription(Specification spec)
  throws ManifoldCFException, ServiceInterruption
{
  final List<String> collections = new ArrayList<String>();
  String template = "";
  int index = 0;
  while (index < spec.getChildCount())
  {
    final SpecificationNode node = spec.getChild(index++);
    final String nodeType = node.getType();
    if (nodeType.equals(GTSConfig.NODE_COLLECTION))
      collections.add(node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE));
    else if (nodeType.equals(GTSConfig.NODE_DOCUMENTTEMPLATE))
      template = node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE);
  }

  // The configuration is constant for any given connector instance, so reading it here is safe.
  final String ingestURI = params.getParameter(GTSConfig.PARAM_INGESTURI);

  // Sort the collection names so that their order in the specification does not affect the version.
  final String[] sortedCollections = collections.toArray(new String[collections.size()]);
  java.util.Arrays.sort(sortedCollections);

  final StringBuilder version = new StringBuilder();
  packList(version,sortedCollections,'+');
  pack(version,template,'+');
  // The ingest URI is appended raw; nothing after this point needs to be unpacked.
  version.append(ingestURI);
  return new VersionContext(version.toString(),params,spec);
}
/** Add (or replace) a document in the output data store using the connector.
* The collection names and document template are read from the specification carried by the
* pipeline description, a session is established, and the document is posted to the ingest API.
*@param documentURI is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process
* and serve the document. This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
*@param pipelineDescription is the description that was constructed for this document by getPipelineDescription().
*@param document is the document data to be processed (handed to the output data store).
*@param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document. May be null.
*@param activities is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity,
* or sending a modified document to the next stage in the pipeline.
*@return the document status (accepted or permanently rejected).
*@throws IOException only if there's a stream error reading the document data.
*/
@Override
public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
  throws ManifoldCFException, ServiceInterruption, IOException
{
  // Pull the collections and the template out of the specification.
  final Specification specification = pipelineDescription.getSpecification();
  final List<String> collections = new ArrayList<String>();
  String template = "";
  int index = 0;
  while (index < specification.getChildCount())
  {
    final SpecificationNode node = specification.getChild(index++);
    final String nodeType = node.getType();
    if (nodeType.equals(GTSConfig.NODE_COLLECTION))
      collections.add(node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE));
    else if (nodeType.equals(GTSConfig.NODE_DOCUMENTTEMPLATE))
      template = node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE);
  }

  // Make sure the poster exists, then hand the document to the ingest API.
  getSession();
  final boolean accepted = poster.indexPost(documentURI,collections,template,authorityNameString,document,activities);
  return accepted ? DOCUMENTSTATUS_ACCEPTED : DOCUMENTSTATUS_REJECTED;
}
/** Remove a document using the connector.
* Establishes a session if needed and posts a delete request for the document URI.
* The outputDescription argument is accepted but not consulted by this implementation.
*@param documentURI is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process
* and serve the document. This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
*@param outputDescription is the last description string that was constructed for this document.
*@param activities is the handle to an object that the implementer of an output connector may use to perform operations, such as logging processing activity.
*/
@Override
public void removeDocument(String documentURI, String outputDescription, IOutputRemoveActivity activities)
throws ManifoldCFException, ServiceInterruption
{
// Establish a session
getSession();
// Call the ingestion API.
poster.deletePost(documentURI,activities);
}
// UI support methods.
//
// These support methods come in two varieties. The first bunch is involved in setting up connection configuration information. The second bunch
// is involved in presenting and editing output specification information for a job. The two kinds of methods are accordingly treated differently,
// in that the first bunch cannot assume that the current connector object is connected, while the second bunch can. That is why the first bunch
// receives a thread context argument for all UI methods, while the second bunch does not need one (since it has already been applied via the connect()
// method, above).
/** Output the configuration header section.
* Called in the head section of the connector's configuration page. Registers the "Appliance" tab
* and emits the javascript checkConfig() and checkConfigForSave() functions, both of which refuse
* an empty ingest URI.
*@param threadContext is the local thread context.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale used to look up the displayed messages.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector.
*/
@Override
public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out,
  Locale locale, ConfigParams parameters, List<String> tabsArray)
  throws ManifoldCFException, IOException
{
  tabsArray.add(Messages.getString(locale,"GTSConnector.Appliance"));

  final String missingUriAlert = Messages.getBodyJavascriptString(locale,"GTSConnector.PleaseSupplyAValidIngestionURI");
  final String applianceTabJs = Messages.getBodyJavascriptString(locale,"GTSConnector.Appliance");

  final StringBuilder script = new StringBuilder();
  script.append("\n");
  script.append("<script type=\"text/javascript\">\n");
  script.append("<!--\n");
  // checkConfig(): validation while moving between tabs.
  script.append("function checkConfig()\n");
  script.append("{\n");
  script.append(" if (editconnection.ingesturi.value == \"\")\n");
  script.append(" {\n");
  script.append(" alert(\"").append(missingUriAlert).append("\");\n");
  script.append(" editconnection.ingesturi.focus();\n");
  script.append(" return false;\n");
  script.append(" }\n");
  script.append(" return true;\n");
  script.append("}\n");
  script.append("\n");
  // checkConfigForSave(): same validation, but also switches to the Appliance tab.
  script.append("function checkConfigForSave()\n");
  script.append("{\n");
  script.append(" if (editconnection.ingesturi.value == \"\")\n");
  script.append(" {\n");
  script.append(" alert(\"").append(missingUriAlert).append("\");\n");
  script.append(" SelectTab(\"").append(applianceTabJs).append("\");\n");
  script.append(" editconnection.ingesturi.focus();\n");
  script.append(" return false;\n");
  script.append(" }\n");
  script.append(" return true;\n");
  script.append("}\n");
  script.append("\n");
  script.append("//-->\n");
  script.append("</script>\n");
  out.print(script.toString());
}
/** Output the configuration body section.
* Called in the body section of the connector's configuration page to present the form elements
* for editing. The emitted HTML is placed inside the "editconnection" form.
* When the "Appliance" tab is selected, visible inputs for the ingest URI, realm, user ID and
* password are emitted. On any other tab the same four values are emitted as hidden inputs so
* that they travel with the form post and are seen by processConfigurationPost().
*@param threadContext is the local thread context.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale used to look up the displayed messages.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*@param tabName is the current tab name.
*/
@Override
public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out,
  Locale locale, ConfigParams parameters, String tabName)
  throws ManifoldCFException, IOException
{
  // Current values, with defaults for anything not yet configured.
  String ingestURI = parameters.getParameter(org.apache.manifoldcf.agents.output.gts.GTSConfig.PARAM_INGESTURI);
  if (ingestURI == null)
    ingestURI = "http://localhost:7031/HTTPIngest";
  String realm = parameters.getParameter(org.apache.manifoldcf.agents.output.gts.GTSConfig.PARAM_REALM);
  if (realm == null)
    realm = "";
  String userID = parameters.getParameter(org.apache.manifoldcf.agents.output.gts.GTSConfig.PARAM_USERID);
  if (userID == null)
    userID = "";
  String password = parameters.getObfuscatedParameter(org.apache.manifoldcf.agents.output.gts.GTSConfig.PARAM_PASSWORD);
  if (password == null)
    password = "";

  // "Appliance" tab
  if (tabName.equals(Messages.getString(locale,"GTSConnector.Appliance")))
  {
    out.print(
"\n"+
"<table class=\"displaytable\">\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.IngestURI") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
" <input name=\"ingesturi\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(ingestURI)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.Realm") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
" <input name=\"realm\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(realm)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.UserID") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
" <input name=\"userid\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(userID)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.Password") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
" <input type=\"password\" size=\"32\" name=\"password\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(password)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
  else
  {
    // Appliance tab hiddens. Every field shown on the tab needs a hidden counterpart here;
    // the realm was previously missing, so it was not carried in the form on other tabs.
    out.print("\n"+
"<input type=\"hidden\" name=\"ingesturi\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(ingestURI)+"\"/>\n"+
"<input type=\"hidden\" name=\"realm\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(realm)+"\"/>\n"+
"<input type=\"hidden\" name=\"userid\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(userID)+"\"/>\n"+
"<input type=\"hidden\" name=\"password\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(password)+"\"/>\n"
    );
  }
}
/** Process a configuration post.
* Called at the start of the connector's configuration page whenever form data for a connection
* may have been posted. Each of the "ingesturi", "realm", "userid" and "password" form variables
* that is present is copied into the configuration parameters; absent variables leave the
* corresponding parameter untouched. The password is stored obfuscated.
* The name of the posted form is "editconnection".
*@param threadContext is the local thread context.
*@param variableContext is the set of variables available from the post, including binary file post information.
*@param locale is the locale of the request (not consulted by this implementation).
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*@return null if all is well, or a string error message if there is an error that should prevent saving of the connection (and cause a redirection to an error page).
*/
@Override
public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext,
  Locale locale, ConfigParams parameters)
  throws ManifoldCFException
{
  final String postedURI = variableContext.getParameter("ingesturi");
  if (postedURI != null)
  {
    parameters.setParameter(GTSConfig.PARAM_INGESTURI,postedURI);
  }

  final String postedRealm = variableContext.getParameter("realm");
  if (postedRealm != null)
  {
    parameters.setParameter(GTSConfig.PARAM_REALM,postedRealm);
  }

  final String postedUser = variableContext.getParameter("userid");
  if (postedUser != null)
  {
    parameters.setParameter(GTSConfig.PARAM_USERID,postedUser);
  }

  final String postedPassword = variableContext.getParameter("password");
  if (postedPassword != null)
  {
    parameters.setObfuscatedParameter(GTSConfig.PARAM_PASSWORD,postedPassword);
  }

  // This implementation never reports a configuration error.
  return null;
}
/** View configuration.
* Called in the body section of the connector's view configuration page to present the connection
* information to the user. Every configuration parameter is listed as name=value, except that:
* parameters whose name ends (case-insensitively) in "password" have their value masked, and
* parameters whose name ends (case-insensitively) in "keystore" show only the certificate count.
*@param threadContext is the local thread context.
*@param out is the output to which any HTML should be sent.
*@param locale is the locale used to look up the displayed messages.
*@param parameters are the configuration parameters, as they currently exist, for this connection being configured.
*/
@Override
public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters)
  throws ManifoldCFException, IOException
{
  out.print(
"\n"+
"<table class=\"displaytable\">\n"+
" <tr>\n"+
" <td class=\"description\" colspan=\"1\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.Parameters") + "</nobr></td>\n"+
" <td class=\"value\" colspan=\"3\">\n"
  );
  Iterator iter = parameters.listParameters();
  while (iter.hasNext())
  {
    String param = (String)iter.next();
    String value = parameters.getParameter(param);
    if (param.length() >= "password".length() && param.substring(param.length()-"password".length()).equalsIgnoreCase("password"))
    {
      // Never display a password value.
      out.print(
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"=********</nobr><br/>\n"
      );
    }
    else if (param.length() >="keystore".length() && param.substring(param.length()-"keystore".length()).equalsIgnoreCase("keystore"))
    {
      IKeystoreManager kmanager = KeystoreManagerFactory.make("",value);
      // The angle brackets must be written as character references; emitted raw, the browser
      // parses "<N certificate(s)>" as an unknown tag and the summary is not displayed.
      out.print(
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"=&lt;"+Integer.toString(kmanager.getContents().length)+" certificate(s)&gt;</nobr><br/>\n"
      );
    }
    else
    {
      out.print(
" <nobr>"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(param)+"="+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(value)+"</nobr><br/>\n"
      );
    }
  }
  out.print(
" </td>\n"+
" </tr>\n"+
"</table>\n"
  );
}
/** Output the specification header section.
* Called in the head section of a job page which has selected a pipeline connection of the
* current type. Registers the collections and template tabs and emits a javascript
* checkSpecification() function (name prefixed with the connection sequence prefix) that rejects
* a collection name longer than 230 characters.
*@param out is the output to which any HTML should be sent.
*@param locale is the preferred locale of the output.
*@param os is the current pipeline specification for this connection.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector.
*/
@Override
public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os,
  int connectionSequenceNumber, List<String> tabsArray)
  throws ManifoldCFException, IOException
{
  final String prefix = "s"+connectionSequenceNumber+"_";
  tabsArray.add(Messages.getString(locale,"GTSConnector.GTSCollections"));
  tabsArray.add(Messages.getString(locale,"GTSConnector.GTSTemplate"));

  final String tooLongAlert = Messages.getBodyJavascriptString(locale,"GTSConnector.CollectionNameMustBeLessThanOrEqualToCharacters");
  // Javascript reference to this connection's collection-name form field.
  final String collectionField = "editjob."+prefix+"gts_collectionname";

  final StringBuilder script = new StringBuilder();
  script.append("\n");
  script.append("<script type=\"text/javascript\">\n");
  script.append("<!--\n");
  script.append("\n");
  script.append("function ").append(prefix).append("checkSpecification()\n");
  script.append("{\n");
  script.append(" if (").append(collectionField).append(".value.length > 230)\n");
  script.append(" {\n");
  script.append(" alert(\"").append(tooLongAlert).append("\");\n");
  script.append(" ").append(collectionField).append(".focus();\n");
  script.append(" return false;\n");
  script.append(" }\n");
  script.append(" return true;\n");
  script.append("}\n");
  script.append("\n");
  script.append("//-->\n");
  script.append("</script>\n");
  out.print(script.toString());
}
/** Output the specification body section.
* Called in the body section of a job page which has selected a pipeline connection of the
* current type. The emitted HTML is placed inside the "editjob" form.
* For each of the two tabs (collections, template), an editable control is emitted when that tab
* is the current one and this connection is the selected one; otherwise the value is emitted as
* a hidden input so it survives the post. If the specification holds several collection nodes,
* the value of the last one is the one presented.
*@param out is the output to which any HTML should be sent.
*@param locale is the preferred locale of the output.
*@param os is the current pipeline specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@param actualSequenceNumber is the connection within the job that has currently been selected.
*@param tabName is the current tab name.
*/
@Override
public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os,
  int connectionSequenceNumber, int actualSequenceNumber, String tabName)
  throws ManifoldCFException, IOException
{
  final String prefix = "s"+connectionSequenceNumber+"_";

  // Scan the specification; a later node of the same type overrides an earlier one.
  String collection = null;
  String template = null;
  for (int idx = 0; idx < os.getChildCount(); idx++)
  {
    final SpecificationNode node = os.getChild(idx);
    if (node.getType().equals(GTSConfig.NODE_COLLECTION))
    {
      collection = node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE);
    }
    else if (node.getType().equals(GTSConfig.NODE_DOCUMENTTEMPLATE))
    {
      template = node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE);
    }
  }
  if (collection == null)
    collection = "";
  if (template == null)
    template = "";

  final String collectionFieldName = prefix+"gts_collectionname";
  final String templateFieldName = prefix+"gts_documenttemplate";

  // Collections tab
  final boolean showCollections =
    tabName.equals(Messages.getString(locale,"GTSConnector.GTSCollections")) && connectionSequenceNumber == actualSequenceNumber;
  if (!showCollections)
  {
    // Hiddens for collections
    out.print(
"<input type=\"hidden\" name=\""+collectionFieldName+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(collection)+"\"/>\n"
    );
  }
  else
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.CollectionName") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
" <input name=\""+collectionFieldName+"\" type=\"text\" size=\"32\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(collection)+"\"/>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }

  // Template tab
  final boolean showTemplate =
    tabName.equals(Messages.getString(locale,"GTSConnector.GTSTemplate")) && connectionSequenceNumber == actualSequenceNumber;
  if (!showTemplate)
  {
    // Hiddens for document template
    out.print(
"<input type=\"hidden\" name=\""+templateFieldName+"\" value=\""+org.apache.manifoldcf.ui.util.Encoder.attributeEscape(template)+"\"/>\n"
    );
  }
  else
  {
    out.print(
"<table class=\"displaytable\">\n"+
" <tr><td class=\"separator\" colspan=\"2\"><hr/></td></tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + Messages.getBodyString(locale,"GTSConnector.DocumentTemplate") + "</nobr></td>\n"+
" <td class=\"value\">\n"+
" <textarea rows=\"10\" cols=\"96\" name=\""+templateFieldName+"\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(template)+"</textarea>\n"+
" </td>\n"+
" </tr>\n"+
"</table>\n"
    );
  }
}
/** Process a specification post.
* Called at the start of a job's edit or view page whenever form data for a connection may have
* been posted. The name of the posted form is "editjob".
* If the collection-name variable was posted, all existing collection nodes are removed and, when
* the posted name is non-empty, a single new collection node is appended. If the document-template
* variable was posted, all existing template nodes are removed and a new one is appended (even
* when the posted template is empty).
*@param variableContext contains the post data, including binary file-upload information.
*@param locale is the preferred locale of the output.
*@param os is the current pipeline specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*@return null if all is well, or a string error message if there is an error that should prevent saving of the job (and cause a redirection to an error page).
*/
@Override
public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os,
  int connectionSequenceNumber)
  throws ManifoldCFException
{
  final String prefix = "s"+connectionSequenceNumber+"_";

  // Collection name
  final String postedCollection = variableContext.getParameter(prefix+"gts_collectionname");
  if (postedCollection != null)
  {
    // Remove in place; the index only advances past nodes that are kept.
    for (int idx = 0; idx < os.getChildCount(); )
    {
      if (os.getChild(idx).getType().equals(GTSConfig.NODE_COLLECTION))
        os.removeChild(idx);
      else
        idx++;
    }
    if (postedCollection.length() > 0)
    {
      final SpecificationNode collectionNode = new SpecificationNode(GTSConfig.NODE_COLLECTION);
      collectionNode.setAttribute(GTSConfig.ATTRIBUTE_VALUE,postedCollection);
      os.addChild(os.getChildCount(),collectionNode);
    }
  }

  // Document template
  final String postedTemplate = variableContext.getParameter(prefix+"gts_documenttemplate");
  if (postedTemplate != null)
  {
    for (int idx = 0; idx < os.getChildCount(); )
    {
      if (os.getChild(idx).getType().equals(GTSConfig.NODE_DOCUMENTTEMPLATE))
        os.removeChild(idx);
      else
        idx++;
    }
    final SpecificationNode templateNode = new SpecificationNode(GTSConfig.NODE_DOCUMENTTEMPLATE);
    templateNode.setAttribute(GTSConfig.ATTRIBUTE_VALUE,postedTemplate);
    os.addChild(os.getChildCount(),templateNode);
  }

  return null;
}
/** View specification.
* Called in the body section of a job's view page to present the pipeline specification to the
* user. Shows the collection name and, when a document template is present, the template inside
* a read-only text area; otherwise the text "None specified" is printed in its place.
*@param out is the output to which any HTML should be sent.
*@param locale is the preferred locale of the output.
*@param os is the current pipeline specification for this job.
*@param connectionSequenceNumber is the unique number of this connection within the job.
*/
@Override
public void viewSpecification(IHTTPOutput out, Locale locale, Specification os,
  int connectionSequenceNumber)
  throws ManifoldCFException, IOException
{
  // Scan the specification; a later node of the same type overrides an earlier one.
  String collection = null;
  String template = null;
  for (int idx = 0; idx < os.getChildCount(); idx++)
  {
    final SpecificationNode node = os.getChild(idx);
    if (node.getType().equals(GTSConfig.NODE_COLLECTION))
    {
      collection = node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE);
    }
    else if (node.getType().equals(GTSConfig.NODE_DOCUMENTTEMPLATE))
    {
      template = node.getAttributeValue(GTSConfig.ATTRIBUTE_VALUE);
    }
  }
  if (collection == null)
    collection = "";
  if (template == null)
    template = "";

  // Display collections
  final String collectionLabel = Messages.getBodyString(locale,"GTSConnector.Collection");
  final String templateLabel = Messages.getBodyString(locale,"GTSConnector.DocumentTemplate");
  out.print(
"\n"+
"<table class=\"displaytable\">\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + collectionLabel + "</nobr></td>\n"+
" <td class=\"value\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(collection)+"</td>\n"+
" </tr>\n"+
" <tr>\n"+
" <td class=\"description\"><nobr>" + templateLabel + "</nobr></td>\n"+
" <td class=\"value\">\n"
  );

  // The template was defaulted to "" above, so an emptiness test is sufficient here.
  if (template.length() > 0)
  {
    out.print(
" <textarea name=\"documenttemplate\" cols=\"96\" rows=\"5\" readonly=\"true\">"+org.apache.manifoldcf.ui.util.Encoder.bodyEscape(template)+"</textarea>\n"
    );
  }
  else
  {
    out.println("None specified");
  }

  out.print(
" </td>\n"+
" </tr>\n"+
"</table>\n"
  );
}
// Protected methods
/** Work out what kind of document a (local) temporary file contains.
* Only the first 4K of the file is read here. If those bytes carry the OLE compound
* document signature, the file is additionally handed to getAppName() so that the
* application that produced it can be recognized.
* The algorithm was originally taken from products/ShareCrawler/Fingerprinter.pas.
*@param file is the local file to examine.
*@return one of the DT_xxx document type values; DT_UNKNOWN is returned whenever the
* type cannot be established, including when reading or parsing the file fails.
*@throws ManifoldCFException with type INTERRUPTED when the read is interrupted
* (a socket timeout does not count as an interruption and yields DT_UNKNOWN).
*/
protected static int fingerprint(File file)
  throws ManifoldCFException
{
  try
  {
    // Only the leading 4K of the file is inspected directly.
    byte[] header = new byte[4096];
    int headerLength = 0;
    InputStream stream = new FileInputStream(file);
    try
    {
      // Keep reading until the buffer is full or the file runs out.
      while (headerLength < header.length)
      {
        int got = stream.read(header,headerLength,header.length-headerLength);
        if (got == -1)
          break;
        headerLength += got;
      }
    }
    finally
    {
      stream.close();
    }

    if (headerLength == 0)
      return DT_ZERO;

    // Anything that looks like text is reported as text. No attempt is made to tell
    // HTML, XML, RTF and plain text apart, since GTS accepts all of them, and the
    // encoding is simply presumed to be one that GTS understands.
    if (isText(header,headerLength))
      return DT_TEXT;

    // Binary content from here on.
    // PDF documents start with the characters "%PDF".
    if (header[0] == (byte)'%' && header[1] == (byte)'P' && header[2] == (byte)'D' && header[3] == (byte)'F')
      return DT_PDF;

    if (Logging.ingest.isDebugEnabled())
    {
      StringBuilder prefix = new StringBuilder("GTS: Document begins with: ");
      for (int k = 0; k < 8; k++)
        prefix.append(hexprint(header[k]));
      Logging.ingest.debug(prefix.toString());
    }

    // OLE compound documents start with the eight bytes 0xD0CF11E0A1B11AE1.
    final byte[] compoundSignature = new byte[]{
      (byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,
      (byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1};
    boolean signatureMatches = true;
    for (int k = 0; k < compoundSignature.length; k++)
    {
      if (header[k] != compoundSignature[k])
      {
        signatureMatches = false;
        break;
      }
    }
    if (!signatureMatches)
      return DT_UNKNOWN;

    Logging.ingest.debug("GTS: Compound document signature detected");
    // The application name tells us which kind of compound document this is.
    String appName = getAppName(file);
    if (appName == null)
      return DT_UNKNOWN;
    if (Logging.ingest.isDebugEnabled())
      Logging.ingest.debug("GTS: Appname is '"+appName+"'");
    return recognizeApp(appName);
  }
  catch (java.net.SocketTimeoutException e)
  {
    // Must precede InterruptedIOException (its superclass): a timeout is not an interruption.
    return DT_UNKNOWN;
  }
  catch (InterruptedIOException e)
  {
    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
  }
  catch (IOException e)
  {
    // If the file cannot be read, its type cannot be determined.
    return DT_UNKNOWN;
  }
  catch (IllegalArgumentException e)
  {
    // Raised by POI; the document type cannot be determined.
    return DT_UNKNOWN;
  }
  catch (IllegalStateException e)
  {
    // Raised by POI; the document type cannot be determined.
    return DT_UNKNOWN;
  }
  catch (ArrayIndexOutOfBoundsException e)
  {
    // POI did not find the bytes it expected, so the type stays unknown.
    return DT_UNKNOWN;
  }
  catch (ClassCastException e)
  {
    // Internal POI failure; the type stays unknown.
    return DT_UNKNOWN;
  }
  catch (OutOfMemoryError e)
  {
    // POI has been seen to throw this for some kinds of corrupt documents.
    // Treating it as "unknown" may not be ideal, but it is the best available
    // choice until sample documents that reproduce the problem can be studied.
    return DT_UNKNOWN;
  }
}
/** Read the application name recorded in the summary information of an OLE
* compound document.
*@param documentPath is the local file to inspect.
*@return the application name reported by the summary-information listener, or null
* when the name could not be obtained for any reason (including any error or
* exception other than an interruption).
*@throws ManifoldCFException with type INTERRUPTED when the read is interrupted
* (a socket timeout is not treated as an interruption and yields null).
*/
protected static String getAppName(File documentPath)
  throws ManifoldCFException
{
  try
  {
    InputStream fileStream = new FileInputStream(documentPath);
    try
    {
      // Let POIFS walk the file, notifying us only about the summary information stream.
      ReaderListener summaryListener = new ReaderListener();
      POIFSReader poifsReader = new POIFSReader();
      poifsReader.registerListener(summaryListener,"\u0005SummaryInformation");
      poifsReader.read(fileStream);
      if (Logging.ingest.isDebugEnabled())
        Logging.ingest.debug("GTS: Done finding appname for '"+documentPath+"'");
      return summaryListener.getAppName();
    }
    finally
    {
      fileStream.close();
    }
  }
  catch (java.net.SocketTimeoutException e)
  {
    // Must precede InterruptedIOException (its superclass): a timeout is not an interruption.
    return null;
  }
  catch (InterruptedIOException e)
  {
    throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
  }
  catch (Throwable e)
  {
    // Everything else is deliberately swallowed. The usual policy is to stop the crawler
    // on out-of-memory errors, but a fingerprinting failure has no "collateral damage":
    // no locks can be dropped and the database driver cannot be corrupted. Any damage
    // that does require a stop should show up in another thread.
    // Here the failure just means that the document cannot be identified.
    return null;
  }
}
/** Map an application name to the document type it implies.
* Matching is case-insensitive and looks for known Microsoft product names anywhere
* within the supplied name.
*@param appName is the application name; must not be null.
*@return the matching DT_MSxxx value, or DT_COMPOUND_DOC when no known product
* name is found.
*/
protected static int recognizeApp(String appName)
{
  String upperName = appName.toUpperCase(Locale.ROOT);
  if (upperName.contains("MICROSOFT WORD") || upperName.contains("MICROSOFT OFFICE WORD"))
    return DT_MSWORD;
  if (upperName.contains("MICROSOFT EXCEL"))
    return DT_MSEXCEL;
  if (upperName.contains("MICROSOFT POWERPOINT") || upperName.contains("MICROSOFT OFFICE POWERPOINT"))
    return DT_MSPOWERPOINT;
  if (upperName.contains("MICROSOFT OUTLOOK"))
    return DT_MSOUTLOOK;
  // A compound document from an application we do not specifically recognize.
  return DT_COMPOUND_DOC;
}
/** Decide whether the leading bytes of a document look like text.
* The approach was taken from products/Sharecrawler/Fingerprinter.pas, which in turn
* was based on "perldoc -f -T".
*@param beginChunk holds the first bytes of the document.
*@param chunkLength is the number of valid bytes in beginChunk.
*@return true if the chunk is empty, or if it contains no zero byte and fewer than
* 30% of its bytes are "strange" (see isStrange()); false otherwise.
*/
protected static boolean isText(byte[] beginChunk, int chunkLength)
{
  if (chunkLength == 0)
    return true;
  int strangeCount = 0;
  for (int pos = 0; pos < chunkLength; pos++)
  {
    byte current = beginChunk[pos];
    // A single NUL byte is enough to call the content binary.
    if (current == 0)
      return false;
    if (isStrange(current))
      strangeCount++;
  }
  return ((double)strangeCount)/((double)chunkLength) < 0.30;
}
/** Check if a byte is not typical ASCII text.
* A byte counts as "strange" when its unsigned value is above 127 (high bit set) or
* below 32 (a control character), unless it is one of the whitespace characters
* accepted by isWhiteSpace(). Note that 0x7f is not considered strange.
*@param x is the byte to examine.
*@return true if the byte is strange, false otherwise.
*/
protected static boolean isStrange(byte x)
{
  // Java bytes are signed, so compare the unsigned value; a direct "x > 127" test
  // on a byte can never be true.
  int unsignedValue = x & 0xff;
  return (unsignedValue > 127 || unsignedValue < 32) && (!isWhiteSpace(x));
}
/** Check if a byte is a whitespace character.
*@param x is the byte to examine.
*@return true for tab (0x09), line feed (0x0a), carriage return (0x0d) and
* space (0x20); false for every other value.
*/
protected static boolean isWhiteSpace(byte x)
{
  switch (x)
  {
  case 0x09:
  case 0x0a:
  case 0x0d:
  case 0x20:
    return true;
  default:
    return false;
  }
}
/** Format a byte as two lowercase hexadecimal digits, high nibble first.
*@param x is the byte to format.
*@return the two-character hexadecimal representation.
*/
protected static String hexprint(byte x)
{
  int value = x;
  // Masking after the shift discards the sign extension of negative bytes.
  char highDigit = nibbleprint((value >> 4) & 0x0f);
  char lowDigit = nibbleprint(value & 0x0f);
  return new String(new char[]{highDigit,lowDigit});
}
/** Convert a nibble value to its lowercase hexadecimal digit.
*@param x is the value to convert; 0-15 map to '0'-'9' and 'a'-'f'.
*@return the corresponding character.
*/
protected static char nibbleprint(int x)
{
  return (char)(x >= 10 ? x - 10 + 'a' : x + '0');
}
/** POIFS reader listener that captures the application name recorded in a
* document's summary information stream. */
protected static class ReaderListener implements POIFSReaderListener
{
  /** The application name found by the last processed event, or null if none was obtained. */
  protected String appName = null;

  /** Constructor. */
  public ReaderListener()
  {
  }

  /** Get the app name.
  *@return the application name captured so far, or null if none was obtained.
  */
  public String getAppName()
  {
    return appName;
  }

  /** Handle an "event" from POIFS, which signals that the stream we registered for
  * (the summary information stream) has been encountered. The stream is parsed as a
  * property set and, if it is a SummaryInformation, its application name is recorded.
  * In every other case, including all handled parse and I/O failures, the recorded
  * name is reset to null.
  *@param event is the POIFS event carrying the stream to read.
  */
  public void processPOIFSReaderEvent(POIFSReaderEvent event)
  {
    try
    {
      InputStream summaryStream = event.getStream();
      try
      {
        PropertySet properties = PropertySetFactory.create(summaryStream);
        if (properties instanceof SummaryInformation)
          appName = ((SummaryInformation)properties).getApplicationName();
        else
          appName = null;
      }
      finally
      {
        summaryStream.close();
      }
    }
    catch (NoPropertySetStreamException e)
    {
      // The stream is not a property set, so the application cannot be determined.
      appName = null;
    }
    catch (MarkUnsupportedException e)
    {
      // Coding problem: the stream handed to POI needs to support the mark operation.
      Logging.ingest.error("Need to feed a stream that supports mark(): "+e.getMessage(),e);
      appName = null;
    }
    catch (java.io.UnsupportedEncodingException e)
    {
      // Coding problem: the encoding needs to be supported properly.
      Logging.ingest.error("Need to support encoding: "+e.getMessage(),e);
      appName = null;
    }
    catch (IOException e)
    {
      // The stream could not be read (or closed); no application name is available.
      appName = null;
    }
  }
}
}