/* $Id$ */
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.agents.transformation.tikaservice;
import org.apache.manifoldcf.core.interfaces.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.TeeInputStream;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.InputStreamEntity;
import org.apache.http.conn.HttpClientConnectionManager;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.protocol.HttpRequestExecutor;
import org.apache.http.impl.client.DefaultRedirectStrategy;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.config.SocketConfig;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.Logging;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.nio.charset.StandardCharsets;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
/**
* This connector works as a transformation connector, but does nothing other
* than logging.
*
*/
public class TikaExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector {
public static final String _rcsid = "@(#)$Id$";
// Velocity template resource names used by the configuration/specification UI.
private static final String EDIT_CONFIGURATION_JS = "editConfiguration.js";
private static final String EDIT_CONFIGURATION_SERVER_HTML = "editConfiguration_Server.html";
private static final String VIEW_CONFIGURATION_HTML = "viewConfiguration.html";
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
// Name of the single activity type this connector records.
protected static final String ACTIVITY_EXTRACT = "extract";
protected static final String[] activitiesList = new String[] { ACTIVITY_EXTRACT };
/** How long (ms) a session stays valid after its last use; see getSession()/poll(). */
protected final static long sessionExpirationInterval = 300000L;
/** We handle up to 64K in memory; after that we go to disk. */
protected static final long inMemoryMaximumFile = 65536;
// Raw parameters
/** Tika host name */
private String tikaHostname = null;
/** Tika port, as configured (string form; parsed into tikaPort by getSession()) */
private String tikaPortString = null;
// Computed parameters
/** Session timeout (absolute ms timestamp; -1L means no active session) */
private long sessionTimeout = -1L;
/** Tika port, parsed to an integer (-1 when no session is active) */
private int tikaPort = -1;
/** Connection manager */
private HttpClientConnectionManager connectionManager = null;
/** Httpclient instance */
private HttpClient httpClient = null;
/** HttpHost */
private HttpHost tikaHost = null;
// Static data
/** Metadata URI */
protected final static URI metaURI;
/** Content URI */
protected final static URI contentURI;
/** Check URI */
protected final static URI checkURI;
// The Tika server REST endpoints are fixed relative paths; construction can
// only fail if these literals are themselves malformed, which is a
// programming error, hence the unchecked rethrow.
static {
  try {
    metaURI = new URI("/meta");
    contentURI = new URI("/tika");
    checkURI = new URI("/detect/stream");
  } catch (URISyntaxException e) {
    // Preserve the original exception as the cause instead of discarding it,
    // so the full stack trace survives the rethrow.
    throw new RuntimeException(e.getMessage(), e);
  }
}
/** Establish the connection configuration for this connector instance.
 * Only records the raw parameters; the actual HTTP session is built lazily
 * by getSession().
 *@param configParameters is the set of configuration parameters, which
 * supply the Tika server host name and port.
 */
@Override
public void connect(ConfigParams configParameters)
{
  super.connect(configParameters);
  this.tikaHostname = configParameters.getParameter(TikaConfig.PARAM_TIKAHOSTNAME);
  this.tikaPortString = configParameters.getParameter(TikaConfig.PARAM_TIKAPORT);
}
/** Close the connection and release all state.
 * Call this before discarding the connector instance.
 */
@Override
public void disconnect()
throws ManifoldCFException
{
  // Tear down any live session first, then clear the raw parameters.
  expireSession();
  this.tikaHostname = null;
  this.tikaPortString = null;
  super.disconnect();
}
/** Periodic maintenance hook, called for connected but idle connector instances.
 * Expires the session once its timeout has passed, and prunes pooled HTTP
 * connections that have been idle for over a minute.
 */
@Override
public void poll()
throws ManifoldCFException
{
  final long now = System.currentTimeMillis();
  if (now >= sessionTimeout) {
    expireSession();
  }
  if (connectionManager != null) {
    connectionManager.closeIdleConnections(60000L, TimeUnit.MILLISECONDS);
  }
}
/** Report whether this connector instance currently holds a live session
 * and should therefore be counted as connected.
 *@return true if a session is active (i.e. a session timeout is set).
 */
@Override
public boolean isConnected()
{
  // A sentinel of -1L means expireSession() has run (or no session was made).
  return sessionTimeout != -1L;
}
/** Set up a session.
 * On first use (sessionTimeout == -1L) this validates the raw host/port
 * parameters and builds the pooled HTTP client used to talk to the Tika
 * server.  Every call (first or not) extends the session expiration by
 * sessionExpirationInterval.
 * Throws ManifoldCFException if the host name is missing or the port is
 * missing or non-numeric.
 */
protected void getSession()
throws ManifoldCFException
{
if (sessionTimeout == -1L)
{
// Validate the raw parameters before constructing anything.
if (tikaHostname == null || tikaHostname.length() == 0) {
throw new ManifoldCFException("Missing host name");
}
if (tikaPortString == null) {
throw new ManifoldCFException("Missing port value");
}
try {
this.tikaPort = Integer.parseInt(tikaPortString);
} catch (NumberFormatException e) {
throw new ManifoldCFException("Bad port number: "+tikaPortString);
}
// Connect within 60s; allow up to 15 minutes for Tika to respond on a socket.
final int connectionTimeout = 60000;
final int socketTimeout = 900000;
// Plain HTTP only; the https registration is commented out below.
final PoolingHttpClientConnectionManager poolingConnectionManager = new PoolingHttpClientConnectionManager(RegistryBuilder.<ConnectionSocketFactory>create()
.register("http", PlainConnectionSocketFactory.getSocketFactory())
//.register("https", myFactory)
.build());
// One connection per route; revalidate connections idle for more than 2s.
poolingConnectionManager.setDefaultMaxPerRoute(1);
poolingConnectionManager.setValidateAfterInactivity(2000);
poolingConnectionManager.setDefaultSocketConfig(SocketConfig.custom()
.setTcpNoDelay(true)
.setSoTimeout(socketTimeout)
.build());
this.connectionManager = poolingConnectionManager;
final RequestConfig.Builder requestBuilder = RequestConfig.custom()
.setCircularRedirectsAllowed(true)
.setSocketTimeout(socketTimeout)
.setExpectContinueEnabled(false)
.setConnectTimeout(connectionTimeout)
.setConnectionRequestTimeout(socketTimeout);
// Retries are disabled; retry policy is handled by ServiceInterruption upstream.
final HttpClientBuilder builder = HttpClients.custom()
.setConnectionManager(connectionManager)
.disableAutomaticRetries()
.setDefaultRequestConfig(requestBuilder.build());
builder.setRequestExecutor(new HttpRequestExecutor(socketTimeout))
.setRedirectStrategy(new DefaultRedirectStrategy());
this.httpClient = builder.build();
this.tikaHost = new HttpHost(tikaHostname, tikaPort);
}
// Extend (or start) the session lifetime on every use.
sessionTimeout = System.currentTimeMillis() + sessionExpirationInterval;
}
/** Discard the current session and free its resources.
 * Safe to call when no session exists; leaves the instance in the
 * not-connected state (sessionTimeout == -1L).
 */
protected void expireSession()
throws ManifoldCFException
{
  // Drop the computed session state.
  this.tikaPort = -1;
  this.httpClient = null;
  this.tikaHost = null;
  final HttpClientConnectionManager cm = this.connectionManager;
  if (cm != null) {
    cm.shutdown();
  }
  this.connectionManager = null;
  this.sessionTimeout = -1L;
}
/** Test the connection. Returns a string describing the connection integrity.
 * Sends a tiny test document to the Tika /meta endpoint and inspects the
 * HTTP status.
 *@return the connection's status as a displayable string ("Connection working"
 * from super.check() on success, otherwise a description of the failure).
 */
@Override
public String check()
throws ManifoldCFException
{
  getSession();
  final HttpPut httpPut = new HttpPut(metaURI);
  httpPut.addHeader("Accept", "application/json");
  // Supply the content length explicitly: the length-less InputStreamEntity
  // constructor is deprecated and forces chunked transfer encoding.
  final byte[] testPayload = "this is a test".getBytes(StandardCharsets.UTF_8);
  final HttpEntity entity = new InputStreamEntity(new ByteArrayInputStream(testPayload), testPayload.length);
  httpPut.setEntity(entity);
  final HttpResponse response;
  try {
    response = this.httpClient.execute(tikaHost, httpPut);
  } catch (IOException e) {
    return "Connection error: "+e.getMessage();
  }
  // Use the status code we just read rather than re-querying the status line.
  final int responseCode = response.getStatusLine().getStatusCode();
  if (responseCode != 200) {
    return "Bad response: "+response.getStatusLine();
  }
  return super.check();
}
/**
 * List the activity types this connector records.  The connector does NOT
 * need to be connected before this is called.
 *
 * @return the set of activity names (currently just {@code extract}).
 */
@Override
public String[] getActivitiesList() {
  return activitiesList;
}
/** Output the configuration header section.
 * Called in the head section of the connector's configuration page; registers
 * the connector's tabs and emits the javascript needed by the editing HTML.
 *@param threadContext is the local thread context.
 *@param out is the output to which any HTML should be sent.
 *@param locale is the preferred locale of the output.
 *@param parameters are the configuration parameters for the connection being configured.
 *@param tabsArray is the list of tab names; connector-specific tabs are appended here.
 */
@Override
public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out,
Locale locale, ConfigParams parameters, List<String> tabsArray)
throws ManifoldCFException, IOException
{
  // Single "Tika Server" tab, plus the shared javascript resource.
  tabsArray.add(Messages.getString(locale, "TikaExtractor.TikaServerTabName"));
  Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_JS, null);
}
/** Output the configuration body section.
 * Called in the body section of the connector's configuration page; presents
 * the form elements for editing.  The output is placed inside the page's
 * existing html/body/form tags; the form is named "editconnection".
 *@param threadContext is the local thread context.
 *@param out is the output to which any HTML should be sent.
 *@param locale is the preferred locale of the output.
 *@param parameters are the configuration parameters for the connection being configured.
 *@param tabName is the current tab name.
 */
@Override
public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out,
Locale locale, ConfigParams parameters, String tabName)
throws ManifoldCFException, IOException
{
  final Map<String, Object> velocityContext = new HashMap<String, Object>();
  velocityContext.put("TabName", tabName);
  // Populate the server-tab variables, then render the template.
  fillInServerTab(velocityContext, out, parameters);
  Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIGURATION_SERVER_HTML, velocityContext);
}
/** Process a configuration post.
 * Called at the start of the connector's configuration page whenever form
 * data may have been posted (form name "editconnection"); copies any posted
 * host/port values into the configuration parameters.
 *@param threadContext is the local thread context.
 *@param variableContext is the set of variables available from the post.
 *@param locale is the preferred locale of the output.
 *@param parameters are the configuration parameters for the connection being configured.
 *@return null if all is well, or an error string that should prevent saving.
 */
@Override
public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext,
Locale locale, ConfigParams parameters)
throws ManifoldCFException
{
  final String postedHostname = variableContext.getParameter("tikaHostname");
  if (postedHostname != null) {
    parameters.setParameter(TikaConfig.PARAM_TIKAHOSTNAME, postedHostname);
  }
  final String postedPort = variableContext.getParameter("tikaPort");
  if (postedPort != null) {
    parameters.setParameter(TikaConfig.PARAM_TIKAPORT, postedPort);
  }
  return null;
}
/** View configuration.
 * Called in the body section of the connector's view-configuration page;
 * presents the connection information (host/port) read-only to the user.
 *@param threadContext is the local thread context.
 *@param out is the output to which any HTML should be sent.
 *@param locale is the preferred locale of the output.
 *@param parameters are the configuration parameters for the connection being viewed.
 */
@Override
public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out,
Locale locale, ConfigParams parameters)
throws ManifoldCFException, IOException
{
  final Map<String, Object> velocityContext = new HashMap<String, Object>();
  // Reuse the same context variables as the edit page.
  fillInServerTab(velocityContext, out, parameters);
  Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIGURATION_HTML, velocityContext);
}
/** Populate the velocity context for the server tab from the configuration,
 * substituting defaults where values are absent.
 *@param velocityContext receives the TIKAHOSTNAME/TIKAPORT variables.
 *@param out is the page output (unused here, kept for signature parity).
 *@param parameters are the configuration parameters to read.
 */
protected static void fillInServerTab(Map<String,Object> velocityContext, IHTTPOutput out, ConfigParams parameters)
throws ManifoldCFException
{
  String hostnameValue = parameters.getParameter(TikaConfig.PARAM_TIKAHOSTNAME);
  String portValue = parameters.getParameter(TikaConfig.PARAM_TIKAPORT);
  if (hostnameValue == null) {
    hostnameValue = TikaConfig.TIKAHOSTNAME_DEFAULT;
  }
  if (portValue == null) {
    portValue = TikaConfig.TIKAPORT_DEFAULT;
  }
  // Fill in context
  velocityContext.put("TIKAHOSTNAME", hostnameValue);
  velocityContext.put("TIKAPORT", portValue);
}
/**
 * Build the output version string for the given output specification.
 * The version string uniquely describes the pertinent details of the
 * specification and configuration so the framework can decide whether a
 * document needs to be output again.  Document contents are not considered
 * here; a separate version string (IRepositoryConnector) covers those.
 *
 * Presumes the connector is configured and can reach the output store if
 * needed.
 *
 * @param os
 *          is the current output specification for the job doing the crawling.
 * @return a version context whose packed string is equal for two
 *         specifications exactly when no re-output is required.
 */
@Override
public VersionContext getPipelineDescription(Specification os) throws ManifoldCFException, ServiceInterruption {
  final SpecPacker packer = new SpecPacker(os);
  return new VersionContext(packer.toPackedString(), params, os);
}
// This connector intercepts document-format checks and forwards modified
// checks downstream, since Tika converts everything to plain text.
/**
 * Detect if a mime type is acceptable or not; used to decide whether it makes
 * sense to fetch a document at all.
 *
 * @param pipelineDescription
 *          is the document's pipeline version string, for this connection.
 * @param mimeType
 *          is the mime type of the document.
 * @param checkActivity
 *          is an object including the activities that can be performed by
 *          this method.
 * @return true if the mime type can be accepted by this connector.
 */
@Override
public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType,
IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption {
  // We should see what Tika will transform
  // MHL
  // Whatever comes in, what goes downstream is always UTF-8 plain text,
  // so ask the downstream stage about that instead of the original type.
  return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
}
/**
 * Pre-determine whether a document (passed here as a File object) is
 * acceptable or not; used to decide whether a document needs to be actually
 * transferred.  Provided mainly for search engines that accept only a small
 * set of file types.
 *
 * @param pipelineDescription
 *          is the document's pipeline version string, for this connection.
 * @param localFile
 *          is the local file to check.
 * @param checkActivity
 *          is an object including the activities that can be done by this
 *          method.
 * @return true if the file is acceptable, false if not.
 */
@Override
public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile,
IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption {
  // File contents don't matter at this stage unless Tika would refuse them,
  // and there is no cheap way to determine that here — accept everything.
  return true;
}
/**
 * Pre-determine whether a document's length is acceptable; used to decide
 * whether to fetch a document in the first place.
 *
 * @param pipelineDescription
 *          is the document's pipeline version string, for this connection.
 * @param length
 *          is the length of the document.
 * @param checkActivity
 *          is an object including the activities that can be done by this
 *          method.
 * @return true if the file is acceptable, false if not.
 */
@Override
public boolean checkLengthIndexable(VersionContext pipelineDescription, long length,
IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption {
  // No length restriction at this stage; any size is accepted.
  return true;
}
/**
 * Add (or replace) a document in the output data store using the connector.
 * This method presumes that the connector object has been configured, and it
 * is thus able to communicate with the output data store should that be
 * necessary. The OutputSpecification is *not* provided to this method,
 * because the goal is consistency, and if output is done it must be
 * consistent with the output description, since that was what was partly used
 * to determine if output should be taking place. So it may be necessary for
 * this method to decode an output description string in order to determine
 * what should be done.
 *
 * @param documentURI
 *          is the URI of the document. The URI is presumed to be the unique
 *          identifier which the output data store will use to process and
 *          serve the document. This URI is constructed by the repository
 *          connector which fetches the document, and is thus universal across
 *          all output connectors.
 * @param pipelineDescription
 *          is the version context constructed for this document by the
 *          getPipelineDescription() method.
 * @param document
 *          is the document data to be processed (handed to the output data
 *          store).
 * @param authorityNameString
 *          is the name of the authority responsible for authorizing any
 *          access tokens passed in with the repository document. May be null.
 * @param activities
 *          is the handle to an object that the implementer of a pipeline
 *          connector may use to perform operations, such as logging
 *          processing activity, or sending a modified document to the next
 *          stage in the pipeline.
 * @return the document status (accepted or permanently rejected).
 * @throws IOException
 *           only if there's a stream error reading the document data.
 */
@Override
public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription,
RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException {
// First, make sure downstream pipeline will now accept
// text/plain;charset=utf-8
if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8")) {
activities.noDocument();
activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI, activities.EXCLUDED_MIMETYPE,
"Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
return DOCUMENTSTATUS_REJECTED;
}
SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
getSession();
// Tika server variables
String mime = "";
InputStream tikaServerIs = null;
HttpResponse response = null;
// NOTE(review): tikaServerDownException is declared but never used anywhere
// in this method; candidate for removal.
IOException tikaServerDownException = null;
// Tika's API reads from an input stream and writes to an output Writer.
// Since a RepositoryDocument includes readers and inputstreams exclusively,
// AND all downstream
// processing needs to occur in a ManifoldCF thread, we have some
// constraints on the architecture we need to get this done:
// (1) The principle worker thread must call the downstream pipeline send()
// method.
// (2) The callee of the send() method must call a reader in the Repository
// Document.
// (3) The Reader, if its databuffer is empty, must pull more data from the
// original input stream and hand it to Tika, which populates the Reader's
// databuffer.
// So all this can be done in one thread, with some work, and the creation
// of a special InputStream or Reader implementation. Where it fails,
// though, is the
// requirement that tika-extracted metadata be included in the
// RepositoryDocument right from the beginning. Effectively this means that
// the entire document
// must be parsed before it is handed downstream -- so basically a temporary
// file (or in-memory buffer if small enough) must be created.
// Instead of the elegant flow above, we have the following:
// (1) Create a temporary file (or in-memory buffer if file is small enough)
// (2) Run Tika to completion, streaming content output to temporary file
// (3) Modify RepositoryDocument to read from temporary file, and include
// Tika-extracted metadata
// (4) Call downstream document processing
// Choose backing storage for the copy of the incoming document: memory for
// small documents (<= inMemoryMaximumFile), disk otherwise.
final DestinationStorage ds;
if (document.getBinaryLength() <= inMemoryMaximumFile) {
ds = new MemoryDestinationStorage((int) document.getBinaryLength());
} else {
ds = new FileDestinationStorage();
}
try {
// Storage for the extracted text returned by the Tika /tika endpoint.
DestinationStorage responseDs = null;
try {
// Seed the metadata with what we already know about the document.
Metadata metadata = new Metadata();
if (document.getFileName() != null) {
metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, document.getFileName());
metadata.add("stream_name", document.getFileName());
}
if (document.getMimeType() != null) {
mime = document.getMimeType();
metadata.add("Content-Type", mime);
}
metadata.add("stream_size", new Long(document.getBinaryLength()).toString());
// We only log the extraction
long startTime = System.currentTimeMillis();
String resultCode = "OK";
String description = null;
Long length = null;
try {
try {
// Make a copy of the original stream as it needs to be sent two
// times to Tika
// one for the metadata and one for the content
final OutputStream os = ds.getOutputStream();
try {
IOUtils.copyLarge(document.getBinaryStream(), os);
} finally {
os.close();
}
// Metadata pass: PUT the document to /meta, expecting a JSON map of
// extracted metadata back.
HttpPut httpPut = new HttpPut(metaURI);
if (!mime.isEmpty()) {
httpPut.addHeader("Content-Type", mime);
}
httpPut.addHeader("Accept", "application/json");
HttpEntity entity = new InputStreamEntity(ds.getInputStream());
httpPut.setEntity(entity);
try {
//System.out.println("About to PUT");
response = this.httpClient.execute(tikaHost, httpPut);
//System.out.println("PUT successful");
} catch (IOException e) {
// Retry 3 times, 10000 ms between retries, and abort if doesn't work
final long currentTime = System.currentTimeMillis();
throw new ServiceInterruption("Tika down, retrying: "+e.getMessage(),e,currentTime + 10000L,
-1L,3,true);
}
int responseCode = response.getStatusLine().getStatusCode();
if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
// Parse the JSON response and merge every key into the metadata.
tikaServerIs = response.getEntity().getContent();
try {
final BufferedReader br = new BufferedReader(new InputStreamReader(tikaServerIs));
final JSONParser parser = new JSONParser();
JSONObject metaJson;
final StringBuilder sb = new StringBuilder();
String output;
while ((output = br.readLine()) != null) {
sb.append(output);
}
metaJson = (JSONObject) parser.parse(sb.toString());
for (Object key : metaJson.keySet()) {
metadata.add(key.toString(), metaJson.get(key).toString());
}
} finally {
tikaServerIs.close();
}
} else {
activities.noDocument();
if (responseCode == 422) {
resultCode = "TIKASERVERREJECTS";
description = "Tika Server rejected document with the following reason: "
+ response.getStatusLine().getReasonPhrase();
// NOTE(review): unlike the content pass below, the handler's return
// value is ignored here and DOCUMENTSTATUS_REJECTED is returned
// unconditionally — confirm this asymmetry is intended.
handleTikaServerRejects(description);
} else {
resultCode = "TIKASERVERERROR";
description = "Tika Server failed to parse document with the following error: "
+ response.getStatusLine().getReasonPhrase();
handleTikaServerError(description);
}
return DOCUMENTSTATUS_REJECTED;
}
// Content pass: PUT the same bytes to /tika, expecting extracted plain
// text back.
httpPut = new HttpPut(contentURI);
if (!mime.isEmpty()) {
httpPut.addHeader("Content-Type", mime);
}
httpPut.addHeader("Accept", "text/plain");
entity = new InputStreamEntity(ds.getInputStream());
httpPut.setEntity(entity);
try {
//System.out.println("About to do a content PUT");
response = this.httpClient.execute(tikaHost, httpPut);
//System.out.println("... content PUT succeeded");
} catch (IOException e) {
// Retry 3 times, 10000 ms between retries, and abort if doesn't work
final long currentTime = System.currentTimeMillis();
throw new ServiceInterruption("Tika down, retrying: "+e.getMessage(),e,currentTime + 10000L,
-1L,3,true);
}
responseCode = response.getStatusLine().getStatusCode();
if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
// Stream extracted text to temporary storage, truncating at the
// spec's writeLimit.
tikaServerIs = response.getEntity().getContent();
try {
responseDs = new FileDestinationStorage();
final OutputStream os2 = responseDs.getOutputStream();
try {
IOUtils.copyLarge(tikaServerIs, os2, 0L, sp.writeLimit);
} finally {
os2.close();
}
length = new Long(responseDs.getBinaryLength());
} finally {
tikaServerIs.close();
}
} else {
activities.noDocument();
if (responseCode == 422) {
resultCode = "TIKASERVERREJECTS";
description = "Tika Server rejected document with the following reason: "
+ response.getStatusLine().getReasonPhrase();
return handleTikaServerRejects(description);
} else {
resultCode = "TIKASERVERERROR";
description = "Tika Server failed to parse document with the following error: "
+ response.getStatusLine().getReasonPhrase();
return handleTikaServerError(description);
}
}
} catch (IOException | ParseException e) {
resultCode = "TIKASERVERRESPONSEISSUE";
description = e.getMessage();
int rval;
if (e instanceof IOException) {
rval = handleTikaServerException((IOException) e);
} else {
rval = handleTikaServerException((ParseException) e);
}
if (rval == DOCUMENTSTATUS_REJECTED) {
activities.noDocument();
}
return rval;
}
// Give the downstream pipeline a chance to veto the extracted length.
if (!activities.checkLengthIndexable(responseDs.getBinaryLength())) {
activities.noDocument();
resultCode = activities.EXCLUDED_LENGTH;
// NOTE(review): the message reports the ORIGINAL document length
// (ds), while the check above used the extracted length (responseDs)
// — confirm which is intended.
description = "Downstream pipeline rejected document with length " + ds.getBinaryLength();
return DOCUMENTSTATUS_REJECTED;
}
} finally {
// Log the extraction processing
activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT, length, documentURI, resultCode, description);
}
// Parsing complete!
// Create a copy of Repository Document
final RepositoryDocument docCopy = document.duplicate();
// Get new stream length
final long newBinaryLength = responseDs.getBinaryLength();
// Open new input stream
final InputStream is = responseDs.getInputStream();
try {
docCopy.setBinary(is, newBinaryLength);
// Set up all metadata from Tika. We may want to run this through a
// mapper eventually...
String[] metaNames = metadata.names();
for (String mName : metaNames) {
String value = metadata.get(mName);
// Optionally normalize field names: lowercase and replace
// non-alphanumerics with '_'.
if (sp.lowerNames()) {
StringBuilder sb = new StringBuilder();
for (int i = 0; i < mName.length(); i++) {
char ch = mName.charAt(i);
if (!Character.isLetterOrDigit(ch))
ch = '_';
else
ch = Character.toLowerCase(ch);
sb.append(ch);
}
mName = sb.toString();
}
// Mapped fields are renamed; unmapped ones are kept only when the
// spec says to keep all metadata.
String target = sp.getMapping(mName);
if (target != null) {
docCopy.addField(target, value);
} else {
if (sp.keepAllMetadata()) {
docCopy.addField(mName, value);
}
}
}
// Send new document downstream
return activities.sendDocument(documentURI, docCopy);
} finally {
is.close();
}
} finally {
if (responseDs != null) {
responseDs.close();
}
}
} finally {
ds.close();
}
}
/**
 * Obtain the name of the form check javascript method to call.
 *
 * @param connectionSequenceNumber
 *          is the unique number of this connection within the job.
 * @return the sequence-prefixed name of the form check javascript method.
 */
@Override
public String getFormCheckJavascriptMethodName(int connectionSequenceNumber) {
  // Prefix with the sequence number so multiple connections can coexist.
  return "s" + connectionSequenceNumber + "_checkSpecification";
}
/**
 * Obtain the name of the form presave check javascript method to call.
 *
 * @param connectionSequenceNumber
 *          is the unique number of this connection within the job.
 * @return the sequence-prefixed name of the form presave check javascript method.
 */
@Override
public String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber) {
  // Prefix with the sequence number so multiple connections can coexist.
  return "s" + connectionSequenceNumber + "_checkSpecificationForSave";
}
/**
 * Output the specification header section.  Called in the head section of a
 * job page that has selected a pipeline connection of this type; registers
 * the required tabs and emits the javascript the job-editing HTML needs.
 *
 * @param out
 *          is the output to which any HTML should be sent.
 * @param locale
 *          is the preferred locale of the output.
 * @param os
 *          is the current pipeline specification for this connection.
 * @param connectionSequenceNumber
 *          is the unique number of this connection within the job.
 * @param tabsArray
 *          is the list of tab names; connector-specific tabs are appended here.
 */
@Override
public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber,
List<String> tabsArray) throws ManifoldCFException, IOException {
  tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
  final Map<String, Object> paramMap = new HashMap<String, Object>();
  paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
  // Fill in the specification header map, using data from all tabs.
  fillInFieldMappingSpecificationMap(paramMap, os);
  Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
}
/**
 * Output the specification body section.  Called in the body section of a job
 * page that has selected a pipeline connection of this type; presents the
 * form elements for editing.  The output is placed inside the page's existing
 * html/body/form tags; the form is named "editjob".
 *
 * @param out
 *          is the output to which any HTML should be sent.
 * @param locale
 *          is the preferred locale of the output.
 * @param os
 *          is the current pipeline specification for this job.
 * @param connectionSequenceNumber
 *          is the unique number of this connection within the job.
 * @param actualSequenceNumber
 *          is the connection within the job that has currently been selected.
 * @param tabName
 *          is the current tab name.
 */
@Override
public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber,
int actualSequenceNumber, String tabName) throws ManifoldCFException, IOException {
  final Map<String, Object> paramMap = new HashMap<String, Object>();
  // Identify the tab and the connection this form belongs to.
  paramMap.put("TABNAME", tabName);
  paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
  paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
  // Fill in the field mapping tab data, then render it.
  fillInFieldMappingSpecificationMap(paramMap, os);
  Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_FIELDMAPPING_HTML, paramMap);
}
/**
* Process a specification post. This method is called at the start of job's
* edit or view page, whenever there is a possibility that form data for a
* connection has been posted. Its purpose is to gather form information and
* modify the transformation specification accordingly. The name of the posted
* form is "editjob".
*
* @param variableContext
* contains the post data, including binary file-upload information.
* @param locale
* is the preferred local of the output.
* @param os
* is the current pipeline specification for this job.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @return null if all is well, or a string error message if there is an error
* that should prevent saving of the job (and cause a redirection to
* an error page).
*/
@Override
public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os,
    int connectionSequenceNumber) throws ManifoldCFException {
  final String seqPrefix = "s" + connectionSequenceNumber + "_";
  // The hidden count field is only present when our tab's form was actually posted.
  final String countValue = variableContext.getParameter(seqPrefix + "fieldmapping_count");
  if (countValue == null || countValue.length() == 0) {
    return null;
  }
  // The form was posted: discard every node this tab owns before rebuilding them.
  int childIndex = 0;
  while (childIndex < os.getChildCount()) {
    final String nodeType = os.getChild(childIndex).getType();
    final boolean ownedByThisTab = nodeType.equals(TikaConfig.NODE_FIELDMAP)
        || nodeType.equals(TikaConfig.NODE_KEEPMETADATA)
        || nodeType.equals(TikaConfig.NODE_LOWERNAMES)
        || nodeType.equals(TikaConfig.NODE_WRITELIMIT);
    if (ownedByThisTab) {
      os.removeChild(childIndex);
    } else {
      childIndex++;
    }
  }
  // Re-create one fieldmap node per surviving row of the posted mapping table.
  final int rowCount = Integer.parseInt(countValue);
  for (int row = 0; row < rowCount; row++) {
    final String rowPrefix = seqPrefix + "fieldmapping_";
    final String rowSuffix = "_" + Integer.toString(row);
    final String rowOp = variableContext.getParameter(rowPrefix + "op" + rowSuffix);
    if (rowOp != null && rowOp.equals("Delete")) {
      // Row was marked for deletion; do not carry it over.
      continue;
    }
    final String source = variableContext.getParameter(rowPrefix + "source" + rowSuffix);
    String target = variableContext.getParameter(rowPrefix + "target" + rowSuffix);
    if (target == null) {
      target = "";
    }
    final SpecificationNode mapNode = new SpecificationNode(TikaConfig.NODE_FIELDMAP);
    mapNode.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
    mapNode.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
    os.addChild(os.getChildCount(), mapNode);
  }
  // The "Add" button contributes one more mapping row beyond the table.
  final String addOp = variableContext.getParameter(seqPrefix + "fieldmapping_op");
  if (addOp != null && addOp.equals("Add")) {
    final String source = variableContext.getParameter(seqPrefix + "fieldmapping_source");
    String target = variableContext.getParameter(seqPrefix + "fieldmapping_target");
    if (target == null) {
      target = "";
    }
    final SpecificationNode mapNode = new SpecificationNode(TikaConfig.NODE_FIELDMAP);
    mapNode.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
    mapNode.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
    os.addChild(os.getChildCount(), mapNode);
  }
  // Keep-all-metadata flag; an absent checkbox posts nothing, which means "false".
  final SpecificationNode keepNode = new SpecificationNode(TikaConfig.NODE_KEEPMETADATA);
  final String keepAll = variableContext.getParameter(seqPrefix + "keepallmetadata");
  keepNode.setAttribute(TikaConfig.ATTRIBUTE_VALUE, keepAll != null ? keepAll : "false");
  os.addChild(os.getChildCount(), keepNode);
  // Lowercase-names flag, same defaulting rule.
  final SpecificationNode lowerNode = new SpecificationNode(TikaConfig.NODE_LOWERNAMES);
  final String lower = variableContext.getParameter(seqPrefix + "lowernames");
  lowerNode.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower != null ? lower : "false");
  os.addChild(os.getChildCount(), lowerNode);
  // Write limit; an empty string means "use the configured default".
  final SpecificationNode limitNode = new SpecificationNode(TikaConfig.NODE_WRITELIMIT);
  final String writeLimit = variableContext.getParameter(seqPrefix + "writelimit");
  limitNode.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit != null ? writeLimit : "");
  os.addChild(os.getChildCount(), limitNode);
  return null;
}
/**
* View specification. This method is called in the body section of a job's
* view page. Its purpose is to present the pipeline specification information
* to the user. The coder can presume that the HTML that is output from this
* configuration will be within appropriate <html> and <body> tags.
*
* @param out
* is the output to which any HTML should be sent.
* @param locale
is the preferred locale of the output.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param os
* is the current pipeline specification for this job.
*/
@Override
public void viewSpecification(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber)
    throws ManifoldCFException, IOException {
  // Build the Velocity context from the specification and render the read-only view.
  final Map<String, Object> velocityContext = new HashMap<>();
  velocityContext.put("SEQNUM", Integer.toString(connectionSequenceNumber));
  fillInFieldMappingSpecificationMap(velocityContext, os);
  Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, velocityContext);
}
/**
 * Populate the Velocity context with the field-mapping data held in the given
 * specification: the list of source/target mappings plus the keep-all-metadata,
 * lowercase-names and write-limit settings.
 *
 * Fix: sn.getAttributeValue() can return null when a (legacy) node lacks the
 * value attribute; previously that null was put straight into the map, which
 * could break template rendering. Now the declared defaults are kept instead.
 *
 * @param paramMap receives the FIELDMAPPINGS, KEEPALLMETADATA, LOWERNAMES and
 *                 WRITELIMIT entries.
 * @param os is the pipeline specification to read.
 */
protected static void fillInFieldMappingSpecificationMap(Map<String, Object> paramMap, Specification os) {
  // Prep for field mappings
  List<Map<String, String>> fieldMappings = new ArrayList<Map<String, String>>();
  // Defaults used when the specification carries no explicit value.
  String keepAllMetadataValue = "true";
  String lowernamesValue = "false";
  String writeLimitValue = "";
  for (int i = 0; i < os.getChildCount(); i++) {
    SpecificationNode sn = os.getChild(i);
    if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
      String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
      String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
      String targetDisplay;
      if (target == null) {
        // A missing target means "remove this field"; show that explicitly.
        target = "";
        targetDisplay = "(remove)";
      } else
        targetDisplay = target;
      Map<String, String> fieldMapping = new HashMap<String, String>();
      fieldMapping.put("SOURCE", source);
      fieldMapping.put("TARGET", target);
      fieldMapping.put("TARGETDISPLAY", targetDisplay);
      fieldMappings.add(fieldMapping);
    } else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
      // Guard against a missing value attribute: retain the default.
      String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      if (value != null)
        keepAllMetadataValue = value;
    } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
      String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      if (value != null)
        lowernamesValue = value;
    } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
      String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      if (value != null)
        writeLimitValue = value;
    }
  }
  paramMap.put("FIELDMAPPINGS", fieldMappings);
  paramMap.put("KEEPALLMETADATA", keepAllMetadataValue);
  paramMap.put("LOWERNAMES", lowernamesValue);
  paramMap.put("WRITELIMIT", writeLimitValue);
}
/**
 * Record a document rejection reported by the Tika server.
 *
 * @param reason is the rejection reason reported by the server.
 * @return DOCUMENTSTATUS_REJECTED so the caller skips the document.
 */
protected static int handleTikaServerRejects(String reason)
    throws IOException, ManifoldCFException, ServiceInterruption {
  // MHL - what does Tika throw if it gets an IOException reading the stream??
  final String message = "Tika Server: Tika Server rejects: " + reason;
  Logging.ingest.warn(message);
  return DOCUMENTSTATUS_REJECTED;
}
/**
 * Record an error response returned by the Tika server.
 *
 * @param description is the error description reported by the server.
 * @return DOCUMENTSTATUS_REJECTED so the caller skips the document.
 */
protected static int handleTikaServerError(String description)
    throws IOException, ManifoldCFException, ServiceInterruption {
  // MHL - what does Tika throw if it gets an IOException reading the stream??
  final String message = "Tika Server: Tika Server error: " + description;
  Logging.ingest.warn(message);
  return DOCUMENTSTATUS_REJECTED;
}
/**
 * Record an IOException raised while extracting via the Tika server.
 *
 * @param e is the exception encountered during extraction.
 * @return DOCUMENTSTATUS_REJECTED so the caller skips the document.
 */
protected static int handleTikaServerException(IOException e)
    throws IOException, ManifoldCFException, ServiceInterruption {
  // MHL - what does Tika throw if it gets an IOException reading the stream??
  final String message = "Tika: Tika exception extracting: " + e.getMessage();
  Logging.ingest.warn(message, e);
  return DOCUMENTSTATUS_REJECTED;
}
/**
 * Record a ParseException raised while extracting via the Tika server.
 *
 * @param e is the parse exception encountered during extraction.
 * @return DOCUMENTSTATUS_REJECTED so the caller skips the document.
 */
protected static int handleTikaServerException(ParseException e)
    throws IOException, ManifoldCFException, ServiceInterruption {
  // MHL - what does Tika throw if it gets an IOException reading the stream??
  final String message = "Tika: Tika exception extracting: " + e.getMessage();
  Logging.ingest.warn(message, e);
  return DOCUMENTSTATUS_REJECTED;
}
/**
 * Map an IOException from local storage into the appropriate ManifoldCFException.
 * This method never returns normally; the int return type only lets callers
 * write {@code return handleIOException(e);}.
 *
 * Fix: java.net.SocketTimeoutException extends InterruptedIOException but
 * signals a timeout, not a thread interruption, so it must not be mapped to
 * the INTERRUPTED error code (which would abort the job as if shut down).
 *
 * @param e is the I/O exception encountered.
 * @throws ManifoldCFException always, wrapping {@code e}.
 */
protected static int handleIOException(IOException e) throws ManifoldCFException {
  // IOException reading from our local storage...
  if (e instanceof InterruptedIOException && !(e instanceof java.net.SocketTimeoutException))
    throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
  throw new ManifoldCFException(e.getMessage(), e);
}
protected static interface DestinationStorage {
/**
 * Get the output stream to write the extracted content to. The caller owns
 * the returned stream and must close it explicitly when done writing.
 */
public OutputStream getOutputStream() throws ManifoldCFException;
/**
 * Get the current binary length of the stored content, in bytes.
 */
public long getBinaryLength() throws ManifoldCFException;
/**
 * Get an input stream for reading the stored content back. The caller owns
 * the returned stream and must close it explicitly when done reading.
 */
public InputStream getInputStream() throws ManifoldCFException;
/**
 * Close the object and clean up all backing resources. This should be
 * called exactly once, when the data is no longer needed.
 */
public void close() throws ManifoldCFException;
}
/**
 * Temp-file-backed implementation of DestinationStorage, used when the
 * extracted content is too large to hold in memory.
 *
 * Fixes: close() now closes the write stream before deleting the file
 * (previously the FileOutputStream handle leaked, and on Windows an open
 * handle makes the delete fail); the constructor now removes the temp file
 * if stream creation fails after the file was already created.
 */
protected static class FileDestinationStorage implements DestinationStorage {
  protected final File outputFile;
  protected final OutputStream outputStream;

  public FileDestinationStorage() throws ManifoldCFException {
    File outputFile = null;
    OutputStream outputStream = null;
    try {
      outputFile = File.createTempFile("mcftika", "tmp");
      outputStream = new FileOutputStream(outputFile);
    } catch (IOException e) {
      // If the temp file was created but the stream failed, remove the orphan.
      if (outputFile != null && outputStream == null)
        outputFile.delete();
      handleIOException(e); // always throws
    }
    this.outputFile = outputFile;
    this.outputStream = outputStream;
  }

  @Override
  public OutputStream getOutputStream() throws ManifoldCFException {
    return outputStream;
  }

  /**
   * Get new binary length (the current size of the temp file, in bytes).
   */
  @Override
  public long getBinaryLength() throws ManifoldCFException {
    return outputFile.length();
  }

  /**
   * Get the input stream to read from. Caller should explicitly close this
   * stream when done reading.
   */
  @Override
  public InputStream getInputStream() throws ManifoldCFException {
    try {
      return new FileInputStream(outputFile);
    } catch (IOException e) {
      handleIOException(e);
      return null;
    }
  }

  /**
   * Close the object and clean up everything. This should be called when the
   * data is no longer needed.
   */
  @Override
  public void close() throws ManifoldCFException {
    // Close the write stream first: the delete below can fail (e.g. on
    // Windows) while the handle is still open, and the handle itself leaks.
    try {
      outputStream.close();
    } catch (IOException e) {
      // Best-effort close; the deletion below is what matters here.
    }
    outputFile.delete();
  }
}
/**
 * In-memory implementation of DestinationStorage, used for content small
 * enough to buffer in a byte array.
 *
 * Fix: close() was missing its @Override annotation, inconsistent with the
 * other interface methods (and with FileDestinationStorage).
 */
protected static class MemoryDestinationStorage implements DestinationStorage {
  protected final ByteArrayOutputStream outputStream;

  /**
   * @param sizeHint is the expected content size, used to presize the buffer.
   */
  public MemoryDestinationStorage(int sizeHint) {
    outputStream = new ByteArrayOutputStream(sizeHint);
  }

  @Override
  public OutputStream getOutputStream() throws ManifoldCFException {
    return outputStream;
  }

  /**
   * Get new binary length (the number of bytes buffered so far).
   */
  @Override
  public long getBinaryLength() throws ManifoldCFException {
    return outputStream.size();
  }

  /**
   * Get the input stream to read from. Caller should explicitly close this
   * stream when done reading.
   */
  @Override
  public InputStream getInputStream() throws ManifoldCFException {
    return new ByteArrayInputStream(outputStream.toByteArray());
  }

  /**
   * Close the object and clean up everything. Nothing to release for an
   * in-memory buffer; the garbage collector reclaims it.
   */
  @Override
  public void close() throws ManifoldCFException {
  }
}
/**
 * Immutable digest of a pipeline Specification: the field mappings plus the
 * keep-all-metadata, lowercase-names and write-limit settings, with a
 * canonical packed-string form usable as a version string.
 *
 * Fixes: removed the unused local {@code ignoreTikaException}; a node whose
 * value attribute is absent (null) no longer NPEs on the write-limit branch
 * or silently flips keep-all-metadata to false via Boolean.parseBoolean(null)
 * — the defaults are kept instead.
 */
protected static class SpecPacker {
  // Source field name -> target field name ("" means remove the field).
  private final Map<String, String> sourceTargets = new HashMap<String, String>();
  private final boolean keepAllMetadata;
  private final boolean lowerNames;
  private final int writeLimit;

  /**
   * @param os is the pipeline specification to digest.
   */
  public SpecPacker(Specification os) {
    boolean keepAllMetadata = true;
    boolean lowerNames = false;
    int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
    for (int i = 0; i < os.getChildCount(); i++) {
      SpecificationNode sn = os.getChild(i);
      if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
        String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
        if (value != null)
          keepAllMetadata = Boolean.parseBoolean(value);
      } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
        String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
        if (value != null)
          lowerNames = Boolean.parseBoolean(value);
      } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
        String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
        if (value == null || value.length() == 0) {
          writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
        } else {
          writeLimit = Integer.parseInt(value);
        }
      } else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
        String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
        String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
        if (target == null) {
          target = "";
        }
        sourceTargets.put(source, target);
      }
    }
    this.keepAllMetadata = keepAllMetadata;
    this.lowerNames = lowerNames;
    this.writeLimit = writeLimit;
  }

  /**
   * Produce the canonical packed string form. Mappings are emitted in sorted
   * source order so identical specifications always produce identical strings.
   */
  public String toPackedString() {
    StringBuilder sb = new StringBuilder();
    // Sort the mapping sources so the packed form is order-independent.
    final String[] sortArray = sourceTargets.keySet().toArray(new String[0]);
    java.util.Arrays.sort(sortArray);
    List<String> packedMappings = new ArrayList<String>();
    String[] fixedList = new String[2];
    for (String source : sortArray) {
      StringBuilder localBuffer = new StringBuilder();
      fixedList[0] = source;
      fixedList[1] = sourceTargets.get(source);
      packFixedList(localBuffer, fixedList, ':');
      packedMappings.add(localBuffer.toString());
    }
    packList(sb, packedMappings, '+');
    // One-character flags for the two booleans.
    sb.append(keepAllMetadata ? '+' : '-');
    sb.append(lowerNames ? '+' : '-');
    // Only a non-default write limit is recorded.
    if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT) {
      sb.append('+');
      sb.append(writeLimit);
    }
    return sb.toString();
  }

  /** Return the mapped target for a source field, or null if unmapped. */
  public String getMapping(String source) {
    return sourceTargets.get(source);
  }

  /** True when all metadata fields should be preserved. */
  public boolean keepAllMetadata() {
    return keepAllMetadata;
  }

  /** True when metadata names should be lowercased. */
  public boolean lowerNames() {
    return lowerNames;
  }

  /** The configured extraction write limit, in characters. */
  public int writeLimit() {
    return writeLimit;
  }
}
}