/* $Id$ */

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.manifoldcf.agents.transformation.tika;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.Logging;

import java.io.*;
import java.util.*;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.html.BoilerpipeContentHandler;

import de.l3s.boilerpipe.BoilerpipeExtractor;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/** This connector works as a transformation connector.  It uses Apache Tika to extract text and
* metadata from documents, and passes the extracted content downstream as text/plain.
*/
public class TikaExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
{
  public static final String _rcsid = "@(#)$Id$";

  private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
  private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
  private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML = "editSpecification_Exceptions.html";
  private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML = "editSpecification_Boilerplate.html";
  private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";

  protected static final String ACTIVITY_EXTRACT = "extract";

  protected static final String[] activitiesList = new String[]{ACTIVITY_EXTRACT};

  /** We handle up to 64K in memory; after that we go to disk. */
  protected static final long inMemoryMaximumFile = 65536;

  /** Return a list of activities that this connector generates.
  * The connector does NOT need to be connected before this method is called.
  *@return the set of activities.
  */
  @Override
  public String[] getActivitiesList()
  {
    return activitiesList;
  }

  /** Get an output version string, given an output specification.  The output version string is used to uniquely describe the pertinent details of
  * the output specification and the configuration, to allow the Connector Framework to determine whether a document will need to be output again.
  * Note that the contents of the document cannot be considered by this method, and that a different version string (defined in IRepositoryConnector)
  * is used to describe the version of the actual document.
  *
  * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
  * necessary.
  *@param os is the current output specification for the job that is doing the crawling.
  *@return a string, of unlimited length, which uniquely describes output configuration and specification in such a way that if two such strings are equal,
  * the document will not need to be sent again to the output data store.
  */
  @Override
  public VersionContext getPipelineDescription(Specification os)
    throws ManifoldCFException, ServiceInterruption
  {
    SpecPacker sp = new SpecPacker(os);
    return new VersionContext(sp.toPackedString(),params,os);
  }

  // We intercept checks pertaining to the document format and send modified checks further down

  /** Detect if a mime type is acceptable or not.  This method is used to determine whether it makes sense to fetch a document
  * in the first place.
  *@param pipelineDescription is the document's pipeline version string, for this connection.
  *@param mimeType is the mime type of the document.
  *@param checkActivity is an object including the activities that can be performed by this method.
  *@return true if the mime type can be accepted by this connector.
  */
  @Override
  public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
    throws ManifoldCFException, ServiceInterruption
  {
    // We should see what Tika will transform
    // MHL
    // Do a downstream check
    return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
  }

  /** Pre-determine whether a document (passed here as a File object) is acceptable or not.  This method is
  * used to determine whether a document needs to be actually transferred.  This hook is provided mainly to support
  * search engines that only handle a small set of accepted file types.
  *@param pipelineDescription is the document's pipeline version string, for this connection.
  *@param localFile is the local file to check.
  *@param checkActivity is an object including the activities that can be done by this method.
  *@return true if the file is acceptable, false if not.
  */
  @Override
  public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
    throws ManifoldCFException, ServiceInterruption
  {
    // Document contents are not germane anymore, unless it looks like Tika won't accept them.
    // Not sure how to check that...
    return true;
  }

  /** Pre-determine whether a document's length is acceptable.  This method is used
  * to determine whether to fetch a document in the first place.
  *@param pipelineDescription is the document's pipeline version string, for this connection.
  *@param length is the length of the document.
  *@param checkActivity is an object including the activities that can be done by this method.
  *@return true if the file is acceptable, false if not.
  */
  @Override
  public boolean checkLengthIndexable(VersionContext pipelineDescription, long length, IOutputCheckActivity checkActivity)
    throws ManifoldCFException, ServiceInterruption
  {
    // Always true
    return true;
  }

  /** Add (or replace) a document in the output data store using the connector.
  * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
  * necessary.
  * The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the
  * output description, since that was what was partly used to determine if output should be taking place.  So it may be necessary for this method to decode
  * an output description string in order to determine what should be done.
  *@param documentURI is the URI of the document.
  * The URI is presumed to be the unique identifier which the output data store will use to process
  * and serve the document.  This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
  *@param pipelineDescription is the description object that was constructed for this document by the getPipelineDescription() method.
  *@param document is the document data to be processed (handed to the output data store).
  *@param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document.  May be null.
  *@param activities is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity,
  * or sending a modified document to the next stage in the pipeline.
  *@return the document status (accepted or permanently rejected).
  *@throws IOException only if there's a stream error reading the document data.
  */
  @Override
  public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
    throws ManifoldCFException, ServiceInterruption, IOException
  {
    // First, make sure downstream pipeline will now accept text/plain;charset=utf-8
    if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8"))
    {
      activities.noDocument();
      activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI,
        activities.EXCLUDED_MIMETYPE, "Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
      return DOCUMENTSTATUS_REJECTED;
    }

    SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
    BoilerpipeExtractor extractorClassInstance = sp.getExtractorClassInstance();

    // Tika's API reads from an input stream and writes to an output Writer.
    // Since a RepositoryDocument includes readers and inputstreams exclusively, AND all downstream
    // processing needs to occur in a ManifoldCF thread, we have some constraints on the architecture we need to get this done:
    // (1) The principal worker thread must call the downstream pipeline send() method.
    // (2) The callee of the send() method must call a reader in the RepositoryDocument.
    // (3) The Reader, if its databuffer is empty, must pull more data from the original input stream and hand it to Tika, which populates the Reader's databuffer.
    // So all this can be done in one thread, with some work, and the creation of a special InputStream or Reader implementation.  Where it fails, though, is the
    // requirement that Tika-extracted metadata be included in the RepositoryDocument right from the beginning.  Effectively this means that the entire document
    // must be parsed before it is handed downstream -- so basically a temporary file (or in-memory buffer if small enough) must be created.
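    // (Added sketch, not part of the original design notes: the single-thread flow described above
    // would look roughly like the following hypothetical Reader, where Tika refills an internal
    // buffer on demand --
    //
    //   class TikaPullReader extends Reader {
    //     public int read(char[] cbuf, int off, int len) throws IOException {
    //       while (bufferIsEmpty())
    //         feedMoreInputToTika();          // Tika's ContentHandler refills the buffer
    //       return drainBuffer(cbuf, off, len);
    //     }
    //   }
    //
    // bufferIsEmpty/feedMoreInputToTika/drainBuffer are illustrative names only.  As noted, this
    // fails the requirement that metadata be complete before send(), so it is not used here.)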
    // Instead of the elegant flow above, we have the following:
    // (1) Create a temporary file (or in-memory buffer if file is small enough)
    // (2) Run Tika to completion, streaming content output to temporary file
    // (3) Modify RepositoryDocument to read from temporary file, and include Tika-extracted metadata
    // (4) Call downstream document processing

    DestinationStorage ds;

    if (document.getBinaryLength() <= inMemoryMaximumFile)
    {
      ds = new MemoryDestinationStorage((int)document.getBinaryLength());
    }
    else
    {
      ds = new FileDestinationStorage();
    }
    try
    {
      Metadata metadata = new Metadata();
      if (document.getFileName() != null)
      {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, document.getFileName());
        metadata.add("stream_name", document.getFileName());
      }
      if (document.getMimeType() != null)
        metadata.add("Content-Type", document.getMimeType());
      metadata.add("stream_size", new Long(document.getBinaryLength()).toString());

      // We only log the extraction
      long startTime = System.currentTimeMillis();
      String resultCode = "OK";
      String description = null;
      Long length = null;
      try
      {
        OutputStream os = ds.getOutputStream();
        try
        {
          Writer w = new OutputStreamWriter(os,"utf-8");
          try
          {
            // Use Tika to parse stuff
            ContentHandler handler = TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
            if (extractorClassInstance != null)
              handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
            try
            {
              TikaParser.parse(document.getBinaryStream(), metadata, handler);
            }
            catch (TikaException e)
            {
              if (sp.ignoreTikaException())
              {
                resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                description = e.getMessage();
              }
              else
              {
                resultCode = "TIKAREJECTION";
                description = e.getMessage();
                int rval = handleTikaException(e);
                if (rval == DOCUMENTSTATUS_REJECTED)
                  activities.noDocument();
                return rval;
              }
            }
            catch (SAXException e)
            {
              resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
              description = e.getMessage();
              int rval = handleSaxException(e);
              if (rval == DOCUMENTSTATUS_REJECTED)
                activities.noDocument();
              return rval;
            }
            catch (IOException e)
            {
              resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
              description = e.getMessage();
              throw e;
            }
          }
          finally
          {
            w.flush();
          }
        }
        finally
        {
          os.close();
          length = new Long(ds.getBinaryLength());
        }

        // Check to be sure downstream pipeline will accept document of specified length
        if (!activities.checkLengthIndexable(ds.getBinaryLength()))
        {
          activities.noDocument();
          resultCode = activities.EXCLUDED_LENGTH;
          description = "Downstream pipeline rejected document with length "+ds.getBinaryLength();
          return DOCUMENTSTATUS_REJECTED;
        }
      }
      finally
      {
        // Log the extraction processing
        activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT, length, documentURI,
          resultCode, description);
      }

      // Parsing complete!

      // Create a copy of Repository Document
      RepositoryDocument docCopy = document.duplicate();

      // Get new stream length
      long newBinaryLength = ds.getBinaryLength();
      // Open new input stream
      InputStream is = ds.getInputStream();
      try
      {
        docCopy.setBinary(is,newBinaryLength);

        // Set up all metadata from Tika.  We may want to run this through a mapper eventually...
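        // (Added example: with lowernames enabled, a Tika metadata name such as "Content-Type"
        // becomes "content_type" -- the loop below lowercases letters and digits and replaces
        // every other character with '_'.)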
        String[] metaNames = metadata.names();
        for (String mName : metaNames)
        {
          String value = metadata.get(mName);
          if (sp.lowerNames())
          {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < mName.length(); i++)
            {
              char ch = mName.charAt(i);
              if (!Character.isLetterOrDigit(ch))
                ch = '_';
              else
                ch = Character.toLowerCase(ch);
              sb.append(ch);
            }
            mName = sb.toString();
          }
          String target = sp.getMapping(mName);
          if (target != null)
          {
            docCopy.addField(target, value);
          }
          else
          {
            if (sp.keepAllMetadata())
            {
              docCopy.addField(mName, value);
            }
          }
        }

        // Send new document downstream
        return activities.sendDocument(documentURI,docCopy);
      }
      finally
      {
        is.close();
      }
    }
    finally
    {
      ds.close();
    }
  }

  /** Obtain the name of the form check javascript method to call.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@return the name of the form check javascript method.
  */
  @Override
  public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
  {
    return "s"+connectionSequenceNumber+"_checkSpecification";
  }

  /** Obtain the name of the form presave check javascript method to call.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@return the name of the form presave check javascript method.
  */
  @Override
  public String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber)
  {
    return "s"+connectionSequenceNumber+"_checkSpecificationForSave";
  }

  /** Output the specification header section.
  * This method is called in the head section of a job page which has selected a pipeline connection of the current type.  Its purpose is to add the required tabs
  * to the list, and to output any javascript methods that might be needed by the job editing HTML.
  *@param out is the output to which any HTML should be sent.
  *@param locale is the preferred locale of the output.
  *@param os is the current pipeline specification for this connection.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@param tabsArray is an array of tab names.  Add to this array any tab names that are specific to the connector.
  */
  @Override
  public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os,
    int connectionSequenceNumber, List<String> tabsArray)
    throws ManifoldCFException, IOException
  {
    Map<String, Object> paramMap = new HashMap<String, Object>();
    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));

    tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
    tabsArray.add(Messages.getString(locale, "TikaExtractor.ExceptionsTabName"));
    tabsArray.add(Messages.getString(locale, "TikaExtractor.BoilerplateTabName"));

    // Fill in the specification header map, using data from all tabs.
    fillInFieldMappingSpecificationMap(paramMap, os);
    fillInExceptionsSpecificationMap(paramMap, os);
    fillInBoilerplateSpecificationMap(paramMap, os);

    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
  }

  /** Output the specification body section.
  * This method is called in the body section of a job page which has selected a pipeline connection of the current type.  Its purpose is to present the required form elements for editing.
  * The coder can presume that the HTML that is output from this configuration will be within appropriate <html>, <body>, and <form> tags.  The name of the
  * form is "editjob".
  *@param out is the output to which any HTML should be sent.
  *@param locale is the preferred locale of the output.
  *@param os is the current pipeline specification for this job.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@param actualSequenceNumber is the connection within the job that has currently been selected.
  *@param tabName is the current tab name.
  */
  @Override
  public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os,
    int connectionSequenceNumber, int actualSequenceNumber, String tabName)
    throws ManifoldCFException, IOException
  {
    Map<String, Object> paramMap = new HashMap<String, Object>();

    // Set the tab name
    paramMap.put("TABNAME", tabName);
    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
    paramMap.put("SELECTEDNUM",Integer.toString(actualSequenceNumber));

    // Fill in the field mapping tab data
    fillInFieldMappingSpecificationMap(paramMap, os);
    fillInExceptionsSpecificationMap(paramMap, os);
    fillInBoilerplateSpecificationMap(paramMap, os);

    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_BOILERPLATE_HTML,paramMap);
  }

  /** Process a specification post.
  * This method is called at the start of job's edit or view page, whenever there is a possibility that form data for a connection has been
  * posted.  Its purpose is to gather form information and modify the transformation specification accordingly.
  * The name of the posted form is "editjob".
  *@param variableContext contains the post data, including binary file-upload information.
  *@param locale is the preferred locale of the output.
  *@param os is the current pipeline specification for this job.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@return null if all is well, or a string error message if there is an error that should prevent saving of the job (and cause a redirection to an error page).
  */
  @Override
  public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os,
    int connectionSequenceNumber)
    throws ManifoldCFException
  {
    String seqPrefix = "s"+connectionSequenceNumber+"_";

    String x;

    x = variableContext.getParameter(seqPrefix+"fieldmapping_count");
    if (x != null && x.length() > 0)
    {
      // About to gather the fieldmapping nodes, so get rid of the old ones.
      int i = 0;
      while (i < os.getChildCount())
      {
        SpecificationNode node = os.getChild(i);
        if (node.getType().equals(TikaConfig.NODE_FIELDMAP) ||
          node.getType().equals(TikaConfig.NODE_KEEPMETADATA) ||
          node.getType().equals(TikaConfig.NODE_LOWERNAMES) ||
          node.getType().equals(TikaConfig.NODE_WRITELIMIT))
          os.removeChild(i);
        else
          i++;
      }
      int count = Integer.parseInt(x);
      i = 0;
      while (i < count)
      {
        String prefix = seqPrefix+"fieldmapping_";
        String suffix = "_"+Integer.toString(i);
        String op = variableContext.getParameter(prefix+"op"+suffix);
        if (op == null || !op.equals("Delete"))
        {
          // Gather the fieldmap etc.
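          // (Added example: for connection sequence number 1 and row 0, the parameters read below
          // are posted as "s1_fieldmapping_source_0" and "s1_fieldmapping_target_0".)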
          String source = variableContext.getParameter(prefix+"source"+suffix);
          String target = variableContext.getParameter(prefix+"target"+suffix);
          if (target == null)
            target = "";
          SpecificationNode node = new SpecificationNode(TikaConfig.NODE_FIELDMAP);
          node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
          node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
          os.addChild(os.getChildCount(),node);
        }
        i++;
      }

      String addop = variableContext.getParameter(seqPrefix+"fieldmapping_op");
      if (addop != null && addop.equals("Add"))
      {
        String source = variableContext.getParameter(seqPrefix+"fieldmapping_source");
        String target = variableContext.getParameter(seqPrefix+"fieldmapping_target");
        if (target == null)
          target = "";
        SpecificationNode node = new SpecificationNode(TikaConfig.NODE_FIELDMAP);
        node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
        node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
        os.addChild(os.getChildCount(),node);
      }

      // Gather the keep all metadata parameter to be the last one
      SpecificationNode node = new SpecificationNode(TikaConfig.NODE_KEEPMETADATA);
      String keepAll = variableContext.getParameter(seqPrefix+"keepallmetadata");
      if (keepAll != null)
      {
        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, keepAll);
      }
      else
      {
        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
      }
      // Add the new keepallmetadata config parameter
      os.addChild(os.getChildCount(), node);

      SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_LOWERNAMES);
      String lower = variableContext.getParameter(seqPrefix+"lowernames");
      if (lower != null)
      {
        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
      }
      else
      {
        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
      }
      os.addChild(os.getChildCount(), node2);

      SpecificationNode node3 = new SpecificationNode(TikaConfig.NODE_WRITELIMIT);
      String writeLimit = variableContext.getParameter(seqPrefix+"writelimit");
      if (writeLimit != null)
      {
        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit);
      }
      else
      {
        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
      }
      os.addChild(os.getChildCount(), node3);
    }

    if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
    {
      int i = 0;
      while (i < os.getChildCount())
      {
        SpecificationNode node = os.getChild(i);
        if (node.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
          os.removeChild(i);
        else
          i++;
      }

      String value = variableContext.getParameter(seqPrefix+"ignoretikaexceptions");
      if (value == null)
        value = "false";

      SpecificationNode node = new SpecificationNode(TikaConfig.NODE_IGNORETIKAEXCEPTION);
      node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, value);
      os.addChild(os.getChildCount(), node);
    }

    x = variableContext.getParameter(seqPrefix+"boilerplateclassname");
    if (x != null)
    {
      int i = 0;
      while (i < os.getChildCount())
      {
        SpecificationNode node = os.getChild(i);
        if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
          os.removeChild(i);
        else
          i++;
      }
      if (x.length() > 0)
      {
        SpecificationNode node = new SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
        os.addChild(os.getChildCount(), node);
      }
    }

    return null;
  }

  /** View specification.
  * This method is called in the body section of a job's view page.  Its purpose is to present the pipeline specification information to the user.
  * The coder can presume that the HTML that is output from this configuration will be within appropriate <html> and <body> tags.
  *@param out is the output to which any HTML should be sent.
  *@param locale is the preferred locale of the output.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@param os is the current pipeline specification for this job.
  */
  @Override
  public void viewSpecification(IHTTPOutput out, Locale locale, Specification os,
    int connectionSequenceNumber)
    throws ManifoldCFException, IOException
  {
    Map<String, Object> paramMap = new HashMap<String, Object>();
    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));

    // Fill in the map with data from all tabs
    fillInFieldMappingSpecificationMap(paramMap, os);
    fillInExceptionsSpecificationMap(paramMap, os);
    fillInBoilerplateSpecificationMap(paramMap, os);

    Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
  }

  protected static void fillInFieldMappingSpecificationMap(Map<String,Object> paramMap, Specification os)
  {
    // Prep for field mappings
    List<Map<String,String>> fieldMappings = new ArrayList<Map<String,String>>();
    String keepAllMetadataValue = "true";
    String lowernamesValue = "false";
    String writeLimitValue = "";
    for (int i = 0; i < os.getChildCount(); i++)
    {
      SpecificationNode sn = os.getChild(i);
      if (sn.getType().equals(TikaConfig.NODE_FIELDMAP))
      {
        String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
        String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
        String targetDisplay;
        if (target == null)
        {
          target = "";
          targetDisplay = "(remove)";
        }
        else
          targetDisplay = target;
        Map<String,String> fieldMapping = new HashMap<String,String>();
        fieldMapping.put("SOURCE",source);
        fieldMapping.put("TARGET",target);
        fieldMapping.put("TARGETDISPLAY",targetDisplay);
        fieldMappings.add(fieldMapping);
      }
      else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA))
      {
        keepAllMetadataValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      }
      else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
      {
        lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      }
      else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
      {
        writeLimitValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      }
    }
    paramMap.put("FIELDMAPPINGS",fieldMappings);
    paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
    paramMap.put("LOWERNAMES",lowernamesValue);
    paramMap.put("WRITELIMIT",writeLimitValue);
  }

  protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
  {
    String ignoreTikaExceptions = "true";
    for (int i = 0; i < os.getChildCount(); i++)
    {
      SpecificationNode sn = os.getChild(i);
      if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
      {
        ignoreTikaExceptions = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      }
    }
    paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
  }

  protected static void fillInBoilerplateSpecificationMap(Map<String,Object> paramMap, Specification os)
  {
    String boilerplateClassName = "";
    for (int i = 0; i < os.getChildCount(); i++)
    {
      SpecificationNode sn = os.getChild(i);
      if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
      {
        boilerplateClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
      }
    }
    paramMap.put("BOILERPLATECLASSNAME",boilerplateClassName);
  }

  protected static int handleTikaException(TikaException e)
    throws IOException, ManifoldCFException, ServiceInterruption
  {
    // MHL - what does Tika throw if it gets an IOException reading the stream??
    Logging.ingest.warn("Tika: Tika exception extracting: "+e.getMessage(),e);
    return DOCUMENTSTATUS_REJECTED;
  }

  protected static int handleSaxException(SAXException e)
    throws IOException, ManifoldCFException, ServiceInterruption
  {
    // MHL - what does this mean?
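    // (Added note, an assumption rather than a documented fact: one known source of a SAXException
    // here is Tika's write-out content handler, which is believed to signal conditions such as the
    // configured write limit being reached by throwing a SAXException; a failure inside a
    // Boilerpipe handler can surface the same way.  Hence the rejection below.)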
    Logging.ingest.warn("Tika: SAX exception extracting: "+e.getMessage(),e);
    return DOCUMENTSTATUS_REJECTED;
  }

  protected static int handleIOException(IOException e)
    throws ManifoldCFException
  {
    // IOException reading from our local storage...
    if (e instanceof InterruptedIOException)
      throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
    throw new ManifoldCFException(e.getMessage(),e);
  }

  protected static interface DestinationStorage
  {
    /** Get the output stream to write to.  Caller should explicitly close this stream when done writing. */
    public OutputStream getOutputStream()
      throws ManifoldCFException;

    /** Get new binary length. */
    public long getBinaryLength()
      throws ManifoldCFException;

    /** Get the input stream to read from.  Caller should explicitly close this stream when done reading. */
    public InputStream getInputStream()
      throws ManifoldCFException;

    /** Close the object and clean up everything.
    * This should be called when the data is no longer needed.
    */
    public void close()
      throws ManifoldCFException;
  }

  protected static class FileDestinationStorage implements DestinationStorage
  {
    protected final File outputFile;
    protected final OutputStream outputStream;

    public FileDestinationStorage()
      throws ManifoldCFException
    {
      File outputFile;
      OutputStream outputStream;
      try
      {
        outputFile = File.createTempFile("mcftika","tmp");
        outputStream = new FileOutputStream(outputFile);
      }
      catch (IOException e)
      {
        handleIOException(e);
        outputFile = null;
        outputStream = null;
      }
      this.outputFile = outputFile;
      this.outputStream = outputStream;
    }

    @Override
    public OutputStream getOutputStream()
      throws ManifoldCFException
    {
      return outputStream;
    }

    /** Get new binary length. */
    @Override
    public long getBinaryLength()
      throws ManifoldCFException
    {
      return outputFile.length();
    }

    /** Get the input stream to read from.  Caller should explicitly close this stream when done reading. */
    @Override
    public InputStream getInputStream()
      throws ManifoldCFException
    {
      try
      {
        return new FileInputStream(outputFile);
      }
      catch (IOException e)
      {
        handleIOException(e);
        return null;
      }
    }

    /** Close the object and clean up everything.
    * This should be called when the data is no longer needed.
    */
    @Override
    public void close()
      throws ManifoldCFException
    {
      outputFile.delete();
    }
  }

  protected static class MemoryDestinationStorage implements DestinationStorage
  {
    protected final ByteArrayOutputStream outputStream;

    public MemoryDestinationStorage(int sizeHint)
    {
      outputStream = new ByteArrayOutputStream(sizeHint);
    }

    @Override
    public OutputStream getOutputStream()
      throws ManifoldCFException
    {
      return outputStream;
    }

    /** Get new binary length. */
    @Override
    public long getBinaryLength()
      throws ManifoldCFException
    {
      return outputStream.size();
    }

    /** Get the input stream to read from.  Caller should explicitly close this stream when done reading. */
    @Override
    public InputStream getInputStream()
      throws ManifoldCFException
    {
      return new ByteArrayInputStream(outputStream.toByteArray());
    }

    /** Close the object and clean up everything.
    * This should be called when the data is no longer needed.
    */
    @Override
    public void close()
      throws ManifoldCFException
    {
    }
  }

  protected static class SpecPacker
  {
    private final Map<String,String> sourceTargets = new HashMap<String,String>();
    private final boolean keepAllMetadata;
    private final boolean lowerNames;
    private final int writeLimit;
    private final boolean ignoreTikaException;
    private final String extractorClassName;

    public SpecPacker(Specification os)
    {
      boolean keepAllMetadata = true;
      boolean lowerNames = false;
      int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
      boolean ignoreTikaException = true;
      String extractorClassName = null;
      for (int i = 0; i < os.getChildCount(); i++)
      {
        SpecificationNode sn = os.getChild(i);

        if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA))
        {
          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
          keepAllMetadata = Boolean.parseBoolean(value);
        }
        else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
        {
          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
          lowerNames = Boolean.parseBoolean(value);
        }
        else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
        {
          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
          if (value.length() == 0)
          {
            writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
          }
          else
          {
            writeLimit = Integer.parseInt(value);
          }
        }
        else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP))
        {
          String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
          String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
          if (target == null)
          {
            target = "";
          }
          sourceTargets.put(source, target);
        }
        else if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
        {
          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
          ignoreTikaException = Boolean.parseBoolean(value);
        }
        else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
        {
          extractorClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
        }
      }
      this.keepAllMetadata = keepAllMetadata;
      this.lowerNames = lowerNames;
      this.writeLimit = writeLimit;
      this.ignoreTikaException = ignoreTikaException;
      this.extractorClassName = extractorClassName;
    }

    public String toPackedString()
    {
      StringBuilder sb = new StringBuilder();
      int i;

      // Mappings
      final String[] sortArray = new String[sourceTargets.size()];
      i = 0;
      for (String source : sourceTargets.keySet())
      {
        sortArray[i++] = source;
      }
      java.util.Arrays.sort(sortArray);

      List<String> packedMappings = new ArrayList<String>();
      String[] fixedList = new String[2];
      for (String source : sortArray)
      {
        String target = sourceTargets.get(source);
        StringBuilder localBuffer = new StringBuilder();
        fixedList[0] = source;
        fixedList[1] = target;
        packFixedList(localBuffer,fixedList,':');
        packedMappings.add(localBuffer.toString());
      }
      packList(sb,packedMappings,'+');

      // Keep all metadata
      if (keepAllMetadata)
        sb.append('+');
      else
        sb.append('-');
      if (lowerNames)
        sb.append('+');
      else
        sb.append('-');
      if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT)
      {
        sb.append('+');
        sb.append(writeLimit);
      }
      if (ignoreTikaException)
        sb.append('+');
      else
        sb.append('-');
      if (extractorClassName != null)
      {
        sb.append('+');
        sb.append(extractorClassName);
      }
      else
        sb.append('-');

      return sb.toString();
    }

    public String getMapping(String source)
    {
      return sourceTargets.get(source);
    }

    public boolean keepAllMetadata()
    {
      return keepAllMetadata;
    }

    public boolean lowerNames()
    {
      return lowerNames;
    }

    public int writeLimit()
    {
      return writeLimit;
    }

    public boolean ignoreTikaException()
    {
      return ignoreTikaException;
    }

    public BoilerpipeExtractor getExtractorClassInstance()
      throws ManifoldCFException
    {
      if (extractorClassName == null)
        return null;
      try
      {
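        // (Added note: Boilerpipe extractor implementations conventionally expose a public static
        // INSTANCE field; for example, a user might configure
        // "de.l3s.boilerpipe.extractors.ArticleExtractor" here.  That class name is illustrative
        // only -- any BoilerpipeExtractor with an INSTANCE field should work.)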
        ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
        Class<?> extractorClass = loader.loadClass(extractorClassName);
        java.lang.reflect.Field f = extractorClass.getField("INSTANCE");
        return (BoilerpipeExtractor)f.get(null);
      }
      catch (ClassNotFoundException e)
      {
        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' not found: "+e.getMessage(),e);
      }
      catch (Exception e)
      {
        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' exception on instantiation: "+e.getMessage(),e);
      }
    }
  }
}