/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to You under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package org.apache.manifoldcf.crawler.connectors.amazons3;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
import java.util.Date;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.amazons3.S3Artifact;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.crawler.interfaces.IExistingVersions;
import org.apache.manifoldcf.crawler.interfaces.IProcessActivity;
import org.apache.manifoldcf.crawler.system.Logging;
import com.amazonaws.AmazonClientException;
import com.amazonaws.AmazonServiceException;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.model.AccessControlList;
import com.amazonaws.services.s3.model.GetObjectRequest;
import com.amazonaws.services.s3.model.Grant;
import com.amazonaws.services.s3.model.ObjectMetadata;
import com.amazonaws.services.s3.model.S3Object;
/**
 * Generic Amazon S3 document processor: fetches objects from a bucket and
 * hands their raw content to ManifoldCF for indexing, without running any
 * Tika content extraction (everything is ingested as {@code text/plain}).
 *
 * @author Kuhajeyan
 */
public class GenericDocumentProcess extends AmazonS3DocumentProcessUtility
		implements DocumentProcess {

	/** MIME type reported for every ingested document (no extraction is done). */
	private static final String TEXT_PLAIN = "text/plain";

	/** Offset (5 minutes) before a transient failure is retried. */
	private static final long RETRY_INTERVAL_MS = 300000L;

	/**
	 * Process documents without any Tika extractor.
	 * <p>
	 * For each identifier: fetches the S3 object, derives ACL tokens from the
	 * object's grants, builds a version string from the last-modified stamp,
	 * applies the activity pre-ingestion filters (length, URL, MIME type,
	 * date) and ingests the raw content stream. Documents that have vanished
	 * or carry no last-modified stamp are deleted from the index.
	 *
	 * @param documentIdentifiers identifiers of the documents to process; null
	 *        or empty entries are skipped with a warning
	 * @param statuses existing version information (unused here; reindexing is
	 *        decided via {@code activities.checkDocumentNeedsReindexing})
	 * @param spec the job specification
	 * @param activities callback interface for ingesting/deleting documents
	 * @param jobMode the job mode
	 * @param usesDefaultAuthority whether the default authority is in use
	 * @param amazons3Client connected S3 client; must not be null
	 * @throws ManifoldCFException on unrecoverable errors (including a null client)
	 * @throws ServiceInterruption on retryable Amazon/IO failures
	 */
	@Override
	public void doProcessDocument(String[] documentIdentifiers,
			IExistingVersions statuses, Specification spec,
			IProcessActivity activities, int jobMode,
			boolean usesDefaultAuthority, AmazonS3 amazons3Client)
			throws ManifoldCFException, ServiceInterruption {
		if (amazons3Client == null)
			throw new ManifoldCFException(
					"Amazon client can not connect at the moment");
		for (String documentIdentifier : documentIdentifiers) {
			try {
				// isEmpty() already covers null, so a separate null check is redundant.
				if (StringUtils.isEmpty(documentIdentifier)) {
					Logging.connectors
							.warn("Document identifier is empty, document will not be processed");
					continue;
				}
				// The identifier must contain both a bucket name and a key.
				// (The original guard compared split() against null, which can
				// never happen, so malformed identifiers slipped through.)
				if (documentIdentifier
						.split(AmazonS3Config.STD_SEPARATOR_BUCKET_AND_KEY).length < 2) {
					Logging.connectors
							.warn("Malformed document identifier, document will not be processed: "
									+ documentIdentifier);
					continue;
				}
				S3Artifact s3Artifact = getS3Artifact(documentIdentifier);
				S3Object s3Obj = amazons3Client.getObject(new GetObjectRequest(
						s3Artifact.getBucketName(), s3Artifact.getKey()));
				if (s3Obj == null) {
					// No such document in the bucket any more: remove it from the index.
					activities.deleteDocument(documentIdentifier);
					continue;
				}
				try {
					processObject(documentIdentifier, s3Artifact, s3Obj,
							activities, amazons3Client);
				}
				finally {
					// S3Object holds an open HTTP connection; the original code
					// leaked it on every exclusion path. Always release it.
					IOUtils.closeQuietly(s3Obj);
				}
			}
			catch (AmazonServiceException e) {
				handleServiceException(e);
			}
			catch (AmazonClientException e) {
				handleClientException(e);
			}
		}
	}

	/**
	 * Processes a single fetched S3 object: filtering, versioning, ACL
	 * resolution and ingestion. The caller owns (and closes) {@code s3Obj}.
	 *
	 * @param documentIdentifier the identifier being processed
	 * @param s3Artifact parsed bucket/key pair for the identifier
	 * @param s3Obj the fetched object; not closed by this method
	 * @param activities ingestion callback interface
	 * @param amazons3Client client used to look up the object ACL
	 */
	private void processObject(String documentIdentifier, S3Artifact s3Artifact,
			S3Object s3Obj, IProcessActivity activities, AmazonS3 amazons3Client)
			throws ManifoldCFException, ServiceInterruption {
		ObjectMetadata objectMetadata = s3Obj.getObjectMetadata();
		Logging.connectors.info("Content-Type: "
				+ objectMetadata.getContentType());
		Date lastModified = objectMetadata.getLastModified();
		if (lastModified == null) {
			// No modification stamp: treat the content as gone and remove it.
			activities.deleteDocument(documentIdentifier);
			return;
		}
		// Turn the object's grants into ACL tokens.
		AccessControlList objectAcl = amazons3Client.getObjectAcl(
				s3Artifact.getBucketName(), s3Artifact.getKey());
		Set<Grant> grants = objectAcl.getGrants();
		String[] aclsToUse = getUsers(grants);
		// The last-modified stamp doubles as the version string.
		String versionString = lastModified.toString();
		Logging.connectors.debug("version string : " + versionString);
		if (versionString.length() > 0
				&& !activities.checkDocumentNeedsReindexing(
						documentIdentifier, versionString)) {
			Logging.connectors
					.info("Document need not to be reindexed : "
							+ documentIdentifier);
			return;
		}
		// Original message said "JIRA:" — copy-paste from another connector.
		Logging.connectors
				.debug("AmazonS3: Processing document identifier '"
						+ documentIdentifier + "'");
		// NOTE(review): the original staged errorCode/errorDesc/fileSize/startTime
		// locals but never reported them anywhere; reinstate them together with
		// an activities.recordActivity(...) call if fetch history is wanted.
		String mimeType = TEXT_PLAIN;// default; no content-type detection is performed
		long fileLength = objectMetadata.getContentLength();
		// Pre-ingestion filters; excluded documents are recorded as "no document"
		// under the current version so they are not refetched until they change.
		if (!activities.checkLengthIndexable(fileLength)) {
			activities.noDocument(documentIdentifier, versionString);
			return;
		}
		String documentURI = getDocumentURI(s3Artifact);
		Logging.connectors.debug("document : " + documentURI);
		if (!activities.checkURLIndexable(documentURI)) {
			activities.noDocument(documentIdentifier, versionString);
			return;
		}
		if (!activities.checkMimeTypeIndexable(mimeType)) {
			activities.noDocument(documentIdentifier, versionString);
			return;
		}
		if (!activities.checkDateIndexable(lastModified)) {
			activities.noDocument(documentIdentifier, versionString);
			return;
		}
		InputStream in = null;
		try {
			in = s3Obj.getObjectContent();
			RepositoryDocument rd = new RepositoryDocument();
			addRawMetadata(rd, objectMetadata);
			// Any explicit allow tokens get the global deny token alongside them.
			String[] denyAclsToUse = aclsToUse.length > 0
					? new String[] { AmazonS3Connector.GLOBAL_DENY_TOKEN }
					: new String[0];
			rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,
					aclsToUse, denyAclsToUse);
			rd.setMimeType(mimeType);
			// lastModified is known non-null here (guarded above).
			rd.setModifiedDate(lastModified);
			// Assign the stream; the framework consumes it during ingestion.
			rd.setBinary(in, fileLength);
			activities.ingestDocumentWithException(documentIdentifier,
					versionString, documentURI, rd);
		}
		catch (IOException e1) {
			handleIOException(e1);
		}
		finally {
			// closeQuietly tolerates null, so no guard is needed.
			IOUtils.closeQuietly(in);
		}
	}

	/**
	 * Maps an IO failure during ingestion onto the ManifoldCF error model:
	 * a true thread interrupt aborts, anything else is retried later.
	 *
	 * @param e the IO failure
	 * @throws ManifoldCFException if the thread was interrupted
	 * @throws ServiceInterruption for all other IO failures (retry in 5
	 *         minutes, give up after 3 hours)
	 */
	protected static void handleIOException(final IOException e)
			throws ManifoldCFException, ServiceInterruption {
		Logging.connectors.error("Error while copying stream", e);
		// SocketTimeoutException extends InterruptedIOException, so the first
		// clause keeps plain timeouts on the retry path below.
		if (!(e instanceof java.net.SocketTimeoutException)
				&& (e instanceof InterruptedIOException)) {
			throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,
					ManifoldCFException.INTERRUPTED);
		}
		long currentTime = System.currentTimeMillis();
		throw new ServiceInterruption("IO exception: " + e.getMessage(), e,
				currentTime + RETRY_INTERVAL_MS, currentTime + 3 * 60 * 60000L,
				-1, false);
	}

	/**
	 * Maps an Amazon service-side failure: non-retryable errors become hard
	 * failures, retryable ones schedule a retry in 5 minutes.
	 *
	 * @param e the service exception
	 * @throws ManifoldCFException if Amazon marks the error non-retryable
	 * @throws ServiceInterruption otherwise
	 */
	protected static void handleServiceException(final AmazonServiceException e)
			throws ManifoldCFException, ServiceInterruption {
		Logging.connectors.error("Service exception status : " + e.getStatusCode(), e);
		if (!e.isRetryable()) {
			// Pass e itself as the cause; the original passed e.getCause(),
			// dropping the service exception from the chain.
			throw new ManifoldCFException("Amazon service exception: "
					+ e.getMessage(), e);
		}
		throw new ServiceInterruption(e.getMessage(),
				System.currentTimeMillis() + RETRY_INTERVAL_MS);
	}

	/**
	 * Maps an Amazon client-side failure: non-retryable errors become hard
	 * failures, retryable ones schedule a retry in 5 minutes.
	 *
	 * @param e the client exception
	 * @throws ManifoldCFException if the error is non-retryable
	 * @throws ServiceInterruption otherwise
	 */
	protected static void handleClientException(final AmazonClientException e)
			throws ManifoldCFException, ServiceInterruption {
		// Log with a message so the entry is searchable, and keep the stack trace.
		Logging.connectors.error("Amazon client exception", e);
		if (!e.isRetryable()) {
			// Pass e itself as the cause (the original dropped it via e.getCause()).
			throw new ManifoldCFException("Amazon client exception: "
					+ e.getMessage(), e);
		}
		throw new ServiceInterruption(e.getMessage(),
				System.currentTimeMillis() + RETRY_INTERVAL_MS);
	}
}