/* $Id$ */

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.agents.transformation.documentfilter;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.agents.system.ManifoldCF;
import org.apache.manifoldcf.agents.system.Logging;
import org.apache.commons.io.FilenameUtils;

import java.io.*;
import java.util.*;
import java.net.*;

/** Transformation connector that rejects documents based on the job's pipeline specification.
* A document is checked against its URL extension, binary length, mime type and modified date;
* documents that pass every check are handed on unchanged to the next pipeline stage.
*/
public class DocumentFilter extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector {

  /** Forward to the javascript to check the specification parameters for the job */
  private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
  /** Velocity template for the "contents" tab of the specification editor */
  private static final String EDIT_SPECIFICATION_CONTENTS_HTML = "editSpecification_Contents.html";
  /** Velocity template for the read-only view of the specification */
  private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";

  /** Activity name recorded whenever a document is rejected by this filter */
  protected static final String ACTIVITY_FILTER = "filter";

  protected static final String[] activitiesList = new String[]{ACTIVITY_FILTER};

  /** Return a list of activities that this connector generates.
  * The connector does NOT need to be connected before this method is called.
  *@return the set of activities.
  */
  @Override
  public String[] getActivitiesList()
  {
    return activitiesList;
  }

  /** Constructor.
  */
  public DocumentFilter()
  {
  }

  /** Get a pipeline version string, given a pipeline specification. The version string is used to
  * uniquely describe the pertinent details of the specification, to allow the framework to determine
  * whether a document will need to be processed again.
  * Note that the contents of the document cannot be considered by this method.
  *@param os is the current pipeline specification for the job that is doing the crawling.
  *@return a version context whose version string is the packed form of the specification (see
  * {@link SpecPacker#toPackedString()}); equal strings mean equal filtering behavior.
  */
  @Override
  public VersionContext getPipelineDescription(Specification os)
    throws ManifoldCFException, ServiceInterruption
  {
    SpecPacker sp = new SpecPacker(os);
    return new VersionContext(sp.toPackedString(),params,os);
  }

  /** Detect if a document date is acceptable or not. This method is used to determine whether it makes sense to fetch a document
  * in the first place.
  *@param outputDescription is the document's pipeline version.
  *@param date is the date of the document.
  *@param activities is an object including the activities that can be performed by this method.
  *@return true if the document with that date can be accepted by this connector.
  */
  @Override
  public boolean checkDateIndexable(VersionContext outputDescription, Date date, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
    return checkDateIndexable(sp, outputDescription, date, activities);
  }

  /** Date check against an already-unpacked specification; consults the downstream pipeline only
  * if this filter accepts the date.
  */
  protected boolean checkDateIndexable(SpecPacker sp, VersionContext outputDescription, Date date, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    return sp.checkDate(date) && super.checkDateIndexable(outputDescription, date, activities);
  }

  /** Detect if a mime type is indexable or not. This method is used by participating repository connectors to pre-filter the number of
  * unusable documents that will be passed to this connector.
  *@param outputDescription is the document's pipeline version.
  *@param mimeType is the mime type of the document.
  *@param activities is an object including the activities that can be performed by this method.
  *@return true if the mime type is indexable by this connector.
  */
  @Override
  public boolean checkMimeTypeIndexable(VersionContext outputDescription, String mimeType, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
    return checkMimeTypeIndexable(sp, outputDescription, mimeType, activities);
  }

  /** Mime type check against an already-unpacked specification; consults the downstream pipeline only
  * if this filter accepts the mime type.
  */
  protected boolean checkMimeTypeIndexable(SpecPacker sp, VersionContext outputDescription, String mimeType, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    return sp.checkMimeType(mimeType) && super.checkMimeTypeIndexable(outputDescription, mimeType, activities);
  }

  /** Detect if a document length is acceptable or not.
  *@param outputDescription is the document's pipeline version.
  *@param length is the length of the document.
  *@param activities is an object including the activities that can be performed by this method.
  *@return true if a document of that length can be accepted by this connector.
  */
  @Override
  public boolean checkLengthIndexable(VersionContext outputDescription, long length, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
    return checkLengthIndexable(sp, outputDescription, length, activities);
  }

  /** Length check against an already-unpacked specification; consults the downstream pipeline only
  * if this filter accepts the length.
  */
  protected boolean checkLengthIndexable(SpecPacker sp, VersionContext outputDescription, long length, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    return sp.checkLengthIndexable(length) && super.checkLengthIndexable(outputDescription, length, activities);
  }

  /** Detect if a document URL is acceptable or not.
  *@param outputDescription is the document's pipeline version.
  *@param url is the URL of the document.
  *@param activities is an object including the activities that can be performed by this method.
  *@return true if a document with that URL can be accepted by this connector.
  */
  @Override
  public boolean checkURLIndexable(VersionContext outputDescription, String url, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    SpecPacker sp = new SpecPacker(outputDescription.getSpecification());
    return checkURLIndexable(sp, outputDescription, url, activities);
  }

  /** URL check against an already-unpacked specification; consults the downstream pipeline only
  * if this filter accepts the URL.
  */
  protected boolean checkURLIndexable(SpecPacker sp, VersionContext outputDescription, String url, IOutputCheckActivity activities)
    throws ManifoldCFException, ServiceInterruption
  {
    return sp.checkURLIndexable(url) && super.checkURLIndexable(outputDescription, url, activities);
  }

  /** Filter a document and, if it is accepted, pass it on to the rest of the pipeline.
  * The URL, length, mime type and modified date are checked in that order; the first failing check
  * causes the document to be rejected, with an activity record describing the reason.
  *@param documentURI is the URI of the document.
  *@param outputDescription is the pipeline description that was constructed for this document by the getPipelineDescription() method.
  *@param document is the document data to be processed.
  *@param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document. May be null.
  *@param activities is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity.
  *@return the document status (accepted or permanently rejected).
  */
  @Override
  public int addOrReplaceDocumentWithException(String documentURI, VersionContext outputDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
    throws ManifoldCFException, ServiceInterruption, IOException
  {
    // Hard filtering (in case connectors don't call check methods above)
    SpecPacker sp = new SpecPacker(outputDescription.getSpecification());

    if (!checkURLIndexable(sp, outputDescription, documentURI, activities))
    {
      return rejectDocument(activities, documentURI, activities.EXCLUDED_URL,
        "URL ('"+documentURI+"')");
    }

    if (!checkLengthIndexable(sp, outputDescription, document.getBinaryLength(), activities))
    {
      return rejectDocument(activities, documentURI, activities.EXCLUDED_LENGTH,
        "length ("+document.getBinaryLength()+")");
    }

    if (!checkMimeTypeIndexable(sp, outputDescription, document.getMimeType(), activities))
    {
      return rejectDocument(activities, documentURI, activities.EXCLUDED_MIMETYPE,
        "mime type ('"+document.getMimeType()+"')");
    }

    if (!checkDateIndexable(sp, outputDescription, document.getModifiedDate(), activities))
    {
      return rejectDocument(activities, documentURI, activities.EXCLUDED_DATE,
        "date ('"+document.getModifiedDate()+"')");
    }

    return activities.sendDocument(documentURI, document);
  }

  /** Record the rejection of a document: signal "no document" downstream, write the activity record,
  * and emit a debug log line.
  *@param activities is the activity handle for the document being processed.
  *@param documentURI is the URI of the rejected document.
  *@param resultCode is the activity result code to record.
  *@param reason is the human-readable reason fragment, e.g. "length (123)".
  *@return DOCUMENTSTATUS_REJECTED, always.
  */
  private int rejectDocument(IOutputAddActivity activities, String documentURI, String resultCode, String reason)
    throws ManifoldCFException, ServiceInterruption, IOException
  {
    activities.noDocument();
    activities.recordActivity(null, ACTIVITY_FILTER, null, documentURI, resultCode, "Rejected due to "+reason);
    if (Logging.ingest.isDebugEnabled())
      Logging.ingest.debug("Document filter: Rejected document "+documentURI+" due to "+reason);
    return DOCUMENTSTATUS_REJECTED;
  }

  /** Populate the Velocity parameter map with the values shown on the "contents" tab.
  * Values missing from the specification fall back to the DocumentFilterConfig defaults; a missing
  * minimum date is presented as the epoch (time 0, UTC).
  *@param paramMap is the map to fill in.
  *@param os is the pipeline specification to read from.
  */
  protected static void fillInContentsSpecificationMap(Map<String,Object> paramMap, Specification os)
  {
    String minFileSize = DocumentFilterConfig.MINLENGTH_DEFAULT;
    String maxFileSize = DocumentFilterConfig.MAXLENGTH_DEFAULT;
    String allowedMimeTypes = DocumentFilterConfig.MIMETYPES_DEFAULT;
    String allowedFileExtensions = DocumentFilterConfig.EXTENSIONS_DEFAULT;
    Long minDate = null;

    for (int i = 0; i < os.getChildCount(); i++)
    {
      SpecificationNode sn = os.getChild(i);
      String nodeType = sn.getType();
      if (nodeType.equals(DocumentFilterConfig.NODE_MAXLENGTH))
        maxFileSize = sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE);
      else if (nodeType.equals(DocumentFilterConfig.NODE_MINLENGTH))
        minFileSize = sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE);
      else if (nodeType.equals(DocumentFilterConfig.NODE_MIMETYPES))
        allowedMimeTypes = sn.getValue();
      else if (nodeType.equals(DocumentFilterConfig.NODE_EXTENSIONS))
        allowedFileExtensions = sn.getValue();
      else if (nodeType.equals(DocumentFilterConfig.NODE_MINDATE))
        minDate = Long.valueOf(sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE));
    }

    paramMap.put("MINFILESIZE",minFileSize);
    paramMap.put("MAXFILESIZE",maxFileSize);
    paramMap.put("MIMETYPES",allowedMimeTypes);
    paramMap.put("EXTENSIONS",allowedFileExtensions);

    // The date is broken out into UTC calendar fields. Note that Calendar.MONTH is zero-based;
    // processSpecificationPost() feeds the posted month straight back into Calendar.set(), so the
    // two methods agree on that convention.
    Calendar c = new GregorianCalendar(TimeZone.getTimeZone("UTC"), Locale.ROOT);
    c.setTimeInMillis((minDate==null)?0L:minDate.longValue());
    paramMap.put("MINDATEYEAR",Integer.toString(c.get(Calendar.YEAR)));
    paramMap.put("MINDATEMONTH",Integer.toString(c.get(Calendar.MONTH)));
    paramMap.put("MINDATEDAY",Integer.toString(c.get(Calendar.DAY_OF_MONTH)));
    paramMap.put("MINDATEHOUR",Integer.toString(c.get(Calendar.HOUR_OF_DAY)));
    paramMap.put("MINDATEMINUTE",String.format(Locale.ROOT, "%02d",c.get(Calendar.MINUTE)));
  }

  /** Obtain the name of the form check javascript method to call.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@return the name of the form check javascript method.
  */
  @Override
  public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
  {
    return "s"+connectionSequenceNumber+"_checkSpecification";
  }

  /** Obtain the name of the form presave check javascript method to call.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@return the name of the form presave check javascript method.
  */
  @Override
  public String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber)
  {
    return "s"+connectionSequenceNumber+"_checkSpecificationForSave";
  }

  /** Output the specification header section.
  * This method is called in the head section of a job page which has selected a pipeline connection of the current type. Its purpose is to add the required tabs
  * to the list, and to output any javascript methods that might be needed by the job editing HTML.
  *@param out is the output to which any HTML should be sent.
  *@param locale is the preferred locale of the output.
  *@param os is the current pipeline specification for this connection.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@param tabsArray is an array of tab names. Add to this array any tab names that are specific to the connector.
  */
  @Override
  public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os,
    int connectionSequenceNumber, List<String> tabsArray)
    throws ManifoldCFException, IOException
  {
    Map<String, Object> paramMap = new HashMap<String, Object>();
    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));

    tabsArray.add(Messages.getString(locale, "DocumentFilter.ContentsTabName"));

    // Fill in the specification header map, using data from all tabs.
    fillInContentsSpecificationMap(paramMap, os);

    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
  }

  /** Output the specification body section.
  * This method is called in the body section of a job page which has selected a pipeline connection of the current type. Its purpose is to present the required form elements for editing.
  * The coder can presume that the HTML that is output from this configuration will be within appropriate &lt;html&gt;, &lt;body&gt;, and &lt;form&gt; tags. The name of the
  * form is "editjob".
  *@param out is the output to which any HTML should be sent.
  *@param locale is the preferred locale of the output.
  *@param os is the current pipeline specification for this job.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@param actualSequenceNumber is the connection within the job that has currently been selected.
  *@param tabName is the current tab name.
  */
  @Override
  public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os,
    int connectionSequenceNumber, int actualSequenceNumber, String tabName)
    throws ManifoldCFException, IOException
  {
    Map<String, Object> paramMap = new HashMap<String, Object>();

    // Set the tab name
    paramMap.put("TABNAME", tabName);
    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
    paramMap.put("SELECTEDNUM",Integer.toString(actualSequenceNumber));

    // Fill in the contents tab data
    fillInContentsSpecificationMap(paramMap, os);

    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_CONTENTS_HTML,paramMap);
  }

  /** Process a specification post.
  * This method is called at the start of job's edit or view page, whenever there is a possibility that form data for a connection has been
  * posted. Its purpose is to gather form information and modify the transformation specification accordingly.
  * The name of the posted form is "editjob".
  * Each posted value replaces every existing specification node of the corresponding type; values that
  * were not posted leave the specification untouched.
  *@param variableContext contains the post data, including binary file-upload information.
  *@param locale is the preferred locale of the output.
  *@param os is the current pipeline specification for this job.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@return null if all is well, or a string error message if there is an error that should prevent saving of the job (and cause a redirection to an error page).
  * An error is returned when the posted minimum-date fields are not all integers.
  */
  @Override
  public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os,
    int connectionSequenceNumber)
    throws ManifoldCFException
  {
    String seqPrefix = "s"+connectionSequenceNumber+"_";

    String minDateYear = variableContext.getParameter(seqPrefix+"mindateyear");
    String minDateMonth = variableContext.getParameter(seqPrefix+"mindatemonth");
    String minDateDay = variableContext.getParameter(seqPrefix + "mindateday");
    String minDateHour = variableContext.getParameter(seqPrefix + "mindatehour");
    String minDateMinute = variableContext.getParameter(seqPrefix + "mindateminute");

    if (minDateYear != null && minDateMonth != null && minDateDay != null && minDateHour != null && minDateMinute != null)
    {
      Calendar c = new GregorianCalendar(TimeZone.getTimeZone("UTC"), Locale.ROOT);
      c.set(Calendar.SECOND, 0);
      c.set(Calendar.MILLISECOND, 0);
      try
      {
        c.set(Integer.parseInt(minDateYear.trim()),
          Integer.parseInt(minDateMonth.trim()),
          Integer.parseInt(minDateDay.trim()),
          Integer.parseInt(minDateHour.trim()),
          Integer.parseInt(minDateMinute.trim()),
          0);
      }
      catch (NumberFormatException e)
      {
        // Do not fall through: the calendar would still hold the current time, which would silently
        // turn a typo into "reject everything older than now".
        return "Invalid minimum date: year, month, day, hour and minute must all be integers";
      }

      SpecificationNode sn = new SpecificationNode(DocumentFilterConfig.NODE_MINDATE);
      sn.setAttribute(DocumentFilterConfig.ATTRIBUTE_VALUE,Long.toString(c.getTimeInMillis()));
      replaceNode(os, sn);
    }

    String x;

    x = variableContext.getParameter(seqPrefix+"minfilesize");
    if (x != null)
    {
      SpecificationNode sn = new SpecificationNode(DocumentFilterConfig.NODE_MINLENGTH);
      sn.setAttribute(DocumentFilterConfig.ATTRIBUTE_VALUE,x);
      replaceNode(os, sn);
    }

    x = variableContext.getParameter(seqPrefix+"maxfilesize");
    if (x != null)
    {
      SpecificationNode sn = new SpecificationNode(DocumentFilterConfig.NODE_MAXLENGTH);
      sn.setAttribute(DocumentFilterConfig.ATTRIBUTE_VALUE,x);
      replaceNode(os, sn);
    }

    x = variableContext.getParameter(seqPrefix+"mimetypes");
    if (x != null)
    {
      SpecificationNode sn = new SpecificationNode(DocumentFilterConfig.NODE_MIMETYPES);
      sn.setValue(x);
      replaceNode(os, sn);
    }

    x = variableContext.getParameter(seqPrefix+"extensions");
    if (x != null)
    {
      SpecificationNode sn = new SpecificationNode(DocumentFilterConfig.NODE_EXTENSIONS);
      sn.setValue(x);
      replaceNode(os, sn);
    }

    return null;
  }

  /** Remove every child of the specification whose type equals the given node type. */
  private static void removeChildrenOfType(Specification os, String nodeType)
  {
    int i = 0;
    while (i < os.getChildCount())
    {
      // Only advance when nothing was removed; removal shifts the next child into slot i.
      if (os.getChild(i).getType().equals(nodeType))
        os.removeChild(i);
      else
        i++;
    }
  }

  /** Replace all existing children of the new node's type with the new node, appended at the end. */
  private static void replaceNode(Specification os, SpecificationNode sn)
  {
    removeChildrenOfType(os, sn.getType());
    os.addChild(os.getChildCount(),sn);
  }

  /** View specification.
  * This method is called in the body section of a job's view page. Its purpose is to present the pipeline specification information to the user.
  * The coder can presume that the HTML that is output from this configuration will be within appropriate &lt;html&gt; and &lt;body&gt; tags.
  *@param out is the output to which any HTML should be sent.
  *@param locale is the preferred locale of the output.
  *@param connectionSequenceNumber is the unique number of this connection within the job.
  *@param os is the current pipeline specification for this job.
  */
  @Override
  public void viewSpecification(IHTTPOutput out, Locale locale, Specification os,
    int connectionSequenceNumber)
    throws ManifoldCFException, IOException
  {
    Map<String, Object> paramMap = new HashMap<String, Object>();
    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));

    // Fill in the map with data from all tabs
    fillInContentsSpecificationMap(paramMap, os);

    Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
  }

  /** Parse a newline-separated list into a set of lower-cased, trimmed, non-empty entries.
  *@param input is the text to parse, one entry per line; may be null.
  *@return the set of entries, or null (meaning "match everything") if the input is null or any
  * line consists solely of "*".
  */
  protected static Set<String> fillSet(String input)
  {
    // No list configured at all: treat it as unrestricted instead of failing inside StringReader.
    if (input == null)
      return null;

    Set<String> rval = new HashSet<String>();
    try
    {
      BufferedReader br = new BufferedReader(new StringReader(input));
      String line;
      while ((line = br.readLine()) != null)
      {
        line = line.trim();
        if (line.equals("*"))
          rval = null;   // wildcard wins; any remaining lines are ignored
        else if (rval != null && line.length() > 0)
          rval.add(line.toLowerCase(Locale.ROOT));
      }
    }
    catch (IOException e)
    {
      // Should never happen
      throw new RuntimeException("IO exception reading strings: "+e.getMessage(),e);
    }
    return rval;
  }

  /** Immutable, unpacked view of a pipeline specification, with the filtering predicates that use it. */
  protected static class SpecPacker {

    // null means "match everything"
    private final Set<String> extensions;
    // null means "match everything"
    private final Set<String> mimeTypes;
    // null means "no lower bound"
    private final Long minLength;
    // null means "no upper bound"
    private final Long lengthCutoff;
    // Milliseconds since epoch; null means "no minimum date"
    private final Long minDate;

    /** Unpack a specification. When a node type occurs more than once, the last occurrence wins.
    *@param os is the pipeline specification to read.
    *@throws NumberFormatException if a length or date attribute is not a valid long.
    */
    public SpecPacker(Specification os) {
      Long minDate = null;
      Long minLength = null;
      Long lengthCutoff = null;
      String extensions = null;
      String mimeTypes = null;

      for (int i = 0; i < os.getChildCount(); i++) {
        SpecificationNode sn = os.getChild(i);
        String nodeType = sn.getType();

        if (nodeType.equals(DocumentFilterConfig.NODE_MIMETYPES)) {
          mimeTypes = sn.getValue();
        } else if (nodeType.equals(DocumentFilterConfig.NODE_EXTENSIONS)) {
          extensions = sn.getValue();
        } else if (nodeType.equals(DocumentFilterConfig.NODE_MAXLENGTH)) {
          lengthCutoff = Long.valueOf(sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE));
        } else if (nodeType.equals(DocumentFilterConfig.NODE_MINLENGTH)) {
          minLength = Long.valueOf(sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE));
        } else if (nodeType.equals(DocumentFilterConfig.NODE_MINDATE)) {
          minDate = Long.valueOf(sn.getAttributeValue(DocumentFilterConfig.ATTRIBUTE_VALUE));
        }
      }

      this.minDate = minDate;
      this.minLength = minLength;
      this.lengthCutoff = lengthCutoff;
      this.extensions = fillSet(extensions);
      this.mimeTypes = fillSet(mimeTypes);
    }

    /** Build the version string for this specification.
    * Fields are emitted in a fixed order (max length, mime types, extensions, min length, min date);
    * each is either '-' (absent) or '+' followed by the packed value. Sets are sorted first so that
    * the result does not depend on set iteration order.
    *@return the packed version string.
    */
    public String toPackedString() {
      StringBuilder sb = new StringBuilder();

      // Max length
      if (lengthCutoff == null)
        sb.append('-');
      else {
        sb.append('+');
        pack(sb,lengthCutoff.toString(),'+');
      }

      // Mime types
      if (this.mimeTypes == null)
        sb.append('-');
      else {
        sb.append('+');
        String[] sortedMimeTypes = this.mimeTypes.toArray(new String[0]);
        Arrays.sort(sortedMimeTypes);
        packList(sb,sortedMimeTypes,'+');
      }

      // Extensions
      if (this.extensions == null)
        sb.append('-');
      else {
        sb.append('+');
        String[] sortedExtensions = this.extensions.toArray(new String[0]);
        Arrays.sort(sortedExtensions);
        packList(sb,sortedExtensions,'+');
      }

      // Min length
      if (minLength == null)
        sb.append('-');
      else {
        sb.append('+');
        pack(sb,minLength.toString(),'+');
      }

      // Min date
      if (minDate == null)
        sb.append('-');
      else {
        sb.append('+');
        pack(sb,minDate.toString(),'+');
      }

      return sb.toString();
    }

    /** Check a document length against the configured bounds (both inclusive).
    *@param length is the document length.
    *@return false if the length is below the minimum or above the maximum; true otherwise.
    */
    public boolean checkLengthIndexable(long length) {
      if (minLength != null && length < minLength.longValue())
        return false;
      if (lengthCutoff != null && length > lengthCutoff.longValue())
        return false;
      return true;
    }

    /** Check a document date against the configured minimum.
    *@param date is the document date; a null date is always accepted.
    *@return false only if a minimum is configured and the date is strictly before it.
    */
    public boolean checkDate(Date date) {
      if (minDate == null || date == null)
        return true;
      return date.getTime() >= minDate.longValue();
    }

    /** Check a mime type against the allowed set (case-insensitive).
    *@param mimeType is the mime type; null is treated as "application/unknown".
    *@return true if every mime type is allowed or the given one is in the allowed set.
    */
    public boolean checkMimeType(String mimeType) {
      if (mimeTypes == null)
        return true;
      if (mimeType == null)
        mimeType = "application/unknown";
      return mimeTypes.contains(mimeType.toLowerCase(Locale.ROOT));
    }

    /** Check a URL's file extension against the allowed set (case-insensitive).
    * The extension is taken from the URI path; if the URL is not a valid URI the extension is taken
    * from the raw string instead. A URL without an extension is looked up as ".".
    *@param url is the document URL.
    *@return true if every extension is allowed or the URL's extension is in the allowed set.
    */
    public boolean checkURLIndexable(String url) {
      if (extensions == null)
        return true;

      String extension = null;
      try {
        String path = new URI(url).getPath();
        if (path != null)
          extension = FilenameUtils.getExtension(path);
      } catch (URISyntaxException e) {
        extension = FilenameUtils.getExtension(url);
      }

      if (extension == null || extension.length() == 0)
        extension = ".";

      return extensions.contains(extension.toLowerCase(Locale.ROOT));
    }

  }

}