/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.manifoldcf.crawler.connectors.alfrescowebscript; import com.github.maoo.indexer.client.AlfrescoClient; import com.github.maoo.indexer.client.AlfrescoDownException; import com.github.maoo.indexer.client.AlfrescoResponse; import com.github.maoo.indexer.client.WebScriptsAlfrescoClient; import org.apache.manifoldcf.agents.interfaces.RepositoryDocument; import org.apache.manifoldcf.agents.interfaces.ServiceInterruption; import org.apache.manifoldcf.core.interfaces.*; import org.apache.manifoldcf.core.common.DateParser; import org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector; import org.apache.manifoldcf.crawler.interfaces.IExistingVersions; import org.apache.manifoldcf.crawler.interfaces.IProcessActivity; import org.apache.manifoldcf.crawler.interfaces.ISeedingActivity; import org.apache.manifoldcf.crawler.system.Logging; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InterruptedIOException; import java.io.InputStream; import java.text.MessageFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.*; public class AlfrescoConnector extends BaseRepositoryConnector { private static final String ACTIVITY_FETCH = "fetch document"; private static final String[] activitiesList = new String[]{ACTIVITY_FETCH}; private AlfrescoClient alfrescoClient; private String binName; private static final String CONTENT_URL_PROPERTY = "contentUrlPath"; private static final String AUTHORITIES_PROPERTY = "readableAuthorities"; private static final String MIMETYPE_PROPERTY = "mimetype"; private static final String SIZE_PROPERTY = "size"; private static final String MODIFIED_DATE_PROPERTY = "cm:modified"; private static final String CREATED_DATE_PROPERTY = "cm:created"; // Static Fields private static final String FIELD_UUID = "uuid"; private static final String FIELD_NODEREF = "nodeRef"; private static final String FIELD_TYPE = "type"; private static final String FIELD_NAME = "name"; @Override public int getConnectorModel() { return MODEL_ADD_CHANGE_DELETE; // We return only incremental documents. } public void setClient(AlfrescoClient client) { alfrescoClient = client; } @Override public String[] getBinNames(String documentIdentifier) { return new String[] { binName }; } @Override public void connect(ConfigParams config) { super.connect(config); String protocol = getConfig(config, "protocol", "http"); String hostname = getConfig(config, "hostname", "localhost"); String port = getConfig(config, "port", "8080"); String endpoint = getConfig(config, "endpoint", "/alfresco/service"); String storeProtocol = getConfig(config, "storeprotocol", "workspace"); String storeId = getConfig(config, "storeid", "SpacesStore"); String username = getConfig(config, "username", null); String password = getObfuscatedConfig(config, "password", null); /* System.out.println("============"); System.out.println(protocol); System.out.println(hostname); System.out.println(port); System.out.println(endpoint); System.out.println(storeProtocol); System.out.println(storeId); System.out.println(username); System.out.println(password); System.out.println("============"); */ alfrescoClient = new WebScriptsAlfrescoClient(protocol, hostname + ":" + port, endpoint, storeProtocol, storeId, username, password); binName = hostname; } private static String getConfig(ConfigParams config, String parameter, String defaultValue) { final String protocol = config.getParameter(parameter); if (protocol == null) { return defaultValue; } return protocol; } private static String getObfuscatedConfig(ConfigParams config, String parameter, String defaultValue) { final String protocol = config.getObfuscatedParameter(parameter); if (protocol == null) { return defaultValue; } return protocol; } @Override public String check() throws ManifoldCFException { try { // We really want to do something more like fetching a document here... alfrescoClient.fetchUserAuthorities("admin"); return super.check(); } catch (AlfrescoDownException e) { if (Logging.connectors != null) { Logging.connectors.warn(e.getMessage(), e); } return "Alfresco connection check failed: " + e.getMessage(); } catch (Exception e) { if (Logging.connectors != null) { Logging.connectors.error(e.getMessage(), e); } throw new ManifoldCFException("Alfresco connection check failed",e); } } @Override public void disconnect() throws ManifoldCFException { alfrescoClient = null; binName = null; super.disconnect(); } @Override public String[] getActivitiesList() { return activitiesList; } @Override public int getMaxDocumentRequest() { return 20; } @Override public String addSeedDocuments(ISeedingActivity activities, Specification spec, String lastSeedVersion, long seedTime, int jobMode) throws ManifoldCFException, ServiceInterruption { try { long lastTransactionId = 0; long lastAclChangesetId = 0; if(lastSeedVersion != null && !lastSeedVersion.isEmpty()) { StringTokenizer tokenizer = new StringTokenizer(lastSeedVersion,"|"); if (tokenizer.countTokens() == 2) { lastTransactionId = new Long(tokenizer.nextToken()); lastAclChangesetId = new Long(tokenizer.nextToken()); } } if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) Logging.connectors.debug(new MessageFormat("Starting from transaction id: {0} and acl changeset id: {1}", Locale.ROOT) .format(new Object[]{lastTransactionId, lastAclChangesetId})); long transactionIdsProcessed; long aclChangesetsProcessed; do { final AlfrescoResponse response = alfrescoClient. fetchNodes(lastTransactionId, lastAclChangesetId, ConfigurationHandler.getFilters(spec)); int count = 0; for (Map<String, Object> doc : response.getDocuments()) { // String json = gson.toJson(doc); // activities.addSeedDocument(json); String uuid = doc.get("uuid").toString(); activities.addSeedDocument(uuid); count++; } if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) Logging.connectors.debug(new MessageFormat("Fetched and added {0} seed documents", Locale.ROOT) .format(new Object[]{new Integer(count)})); transactionIdsProcessed = response.getLastTransactionId() - lastTransactionId; aclChangesetsProcessed = response.getLastAclChangesetId() - lastAclChangesetId; lastTransactionId = response.getLastTransactionId(); lastAclChangesetId = response.getLastAclChangesetId(); if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) Logging.connectors.debug(new MessageFormat("transaction_id={0}, acl_changeset_id={1}", Locale.ROOT) .format(new Object[]{lastTransactionId, lastAclChangesetId})); } while (transactionIdsProcessed > 0 || aclChangesetsProcessed > 0); if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) Logging.connectors.debug(new MessageFormat("Recording {0} as last transaction id and {1} as last changeset id", Locale.ROOT) .format(new Object[]{lastTransactionId, lastAclChangesetId})); return lastTransactionId + "|" + lastAclChangesetId; } catch (AlfrescoDownException e) { handleAlfrescoDownException(e,"seeding"); return null; } } @Override public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec, IProcessActivity activities, int jobMode, boolean usesDefaultAuthority) throws ManifoldCFException, ServiceInterruption { boolean enableDocumentProcessing = ConfigurationHandler.getEnableDocumentProcessing(spec); for (String doc : documentIdentifiers) { String errorCode = null; String errorDesc = null; Long fileLengthLong = null; long startTime = System.currentTimeMillis(); try { String nextVersion = statuses.getIndexedVersionString(doc); // Calling again Alfresco API because Document's actions are lost from seeding method AlfrescoResponse response = alfrescoClient.fetchNode(doc); if(response.getDocumentList().isEmpty()){ // Not found seeded document. Could reflect an error in Alfresco if (Logging.connectors != null) Logging.connectors.warn(new MessageFormat("Invalid Seeded Document from Alfresco with ID {0}", Locale.ROOT) .format(new Object[]{doc})); activities.deleteDocument(doc); continue; } Map<String, Object> map = response.getDocumentList().get(0); // Should be only one if ((Boolean) map.get("deleted")) { activities.deleteDocument(doc); continue; } // From the map, get the things we know about String uuid = doc; String nodeRef = map.containsKey(FIELD_NODEREF) ? map.get(FIELD_NODEREF).toString() : ""; String type = map.containsKey(FIELD_TYPE) ? map.get(FIELD_TYPE).toString() : ""; String name = map.containsKey(FIELD_NAME) ? map.get(FIELD_NAME).toString() : ""; // Fetch document metadata Map<String,Object> properties = alfrescoClient.fetchMetadata(uuid); // Process various special fields Object mdObject; // Size Long lSize = null; mdObject = properties.get(SIZE_PROPERTY); if (mdObject != null) { String size = mdObject.toString(); lSize = new Long(size); } // Modified Date Date modifiedDate = null; mdObject = properties.get(MODIFIED_DATE_PROPERTY); if (mdObject != null) { modifiedDate = DateParser.parseISO8601Date(mdObject.toString()); } // Created Date Date createdDate = null; mdObject = properties.get(CREATED_DATE_PROPERTY); if (mdObject != null) { createdDate = DateParser.parseISO8601Date(mdObject.toString()); } // Establish the document version. if (modifiedDate == null) { activities.deleteDocument(doc); continue; } StringBuilder sb = new StringBuilder(); sb.append((enableDocumentProcessing?"+":"-")); sb.append(new Long(modifiedDate.getTime()).toString()); @SuppressWarnings("unchecked") List<String> permissions = (List<String>) properties.remove(AUTHORITIES_PROPERTY); if(permissions != null){ for (String permission : permissions) { sb.append(permission); } } String documentVersion = sb.toString(); if(!activities.checkDocumentNeedsReindexing(doc, documentVersion)) continue; String mimeType = null; Object mimetypeObject = properties.get(MIMETYPE_PROPERTY); if (mimetypeObject != null) { mimeType = mimetypeObject.toString(); } if (lSize != null && !activities.checkLengthIndexable(lSize.longValue())) { activities.noDocument(doc, documentVersion); errorCode = activities.EXCLUDED_LENGTH; errorDesc = "Excluding document because of length ("+lSize+")"; continue; } if (!activities.checkMimeTypeIndexable(mimeType)) { activities.noDocument(doc, documentVersion); errorCode = activities.EXCLUDED_MIMETYPE; errorDesc = "Excluding document because of mime type ("+mimeType+")"; continue; } if (!activities.checkDateIndexable(modifiedDate)) { activities.noDocument(doc, documentVersion); errorCode = activities.EXCLUDED_DATE; errorDesc = "Excluding document because of date ("+modifiedDate+")"; continue; } String contentUrlPath = (String) properties.get(CONTENT_URL_PROPERTY); if (contentUrlPath == null || contentUrlPath.isEmpty()) { activities.noDocument(doc, documentVersion); errorCode = "NOURL"; errorDesc = "Excluding document because no URL found"; continue; } if (!activities.checkURLIndexable(contentUrlPath)) { activities.noDocument(doc, documentVersion); errorCode = activities.EXCLUDED_URL; errorDesc = "Excluding document because of URL ('"+contentUrlPath+"')"; continue; } RepositoryDocument rd = new RepositoryDocument(); rd.addField(FIELD_NODEREF, nodeRef); rd.addField(FIELD_TYPE, type); rd.setFileName(name); if (modifiedDate != null) rd.setModifiedDate(modifiedDate); if (createdDate != null) rd.setCreatedDate(createdDate); for(String property : properties.keySet()) { Object propertyValue = properties.get(property); rd.addField(property,propertyValue.toString()); } if (mimeType != null && !mimeType.isEmpty()) rd.setMimeType(mimeType); // Indexing Permissions if(permissions != null){ rd.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT, permissions.toArray(new String[permissions.size()])); } // Document Binary Content InputStream stream; long length; byte[] empty = new byte[0]; if (enableDocumentProcessing) { if (lSize != null) { stream = alfrescoClient.fetchContent(contentUrlPath); if (stream == null) { activities.noDocument(doc, documentVersion); errorCode = "NOSTREAM"; errorDesc = "Excluding document because no content stream found"; continue; } length = lSize.longValue(); } else { stream = new ByteArrayInputStream(empty); length = 0L; } } else { stream = new ByteArrayInputStream(empty); length = 0L; } try { rd.setBinary(stream, length); if (Logging.connectors != null && Logging.connectors.isDebugEnabled()) Logging.connectors.debug(new MessageFormat("Ingesting with id: {0}, URI {1} and rd {2}", Locale.ROOT) .format(new Object[]{uuid, nodeRef, rd.getFileName()})); activities.ingestDocumentWithException(doc, documentVersion, contentUrlPath, rd); errorCode = "OK"; fileLengthLong = new Long(length); } catch (IOException e) { handleIOException(e,"reading stream"); } finally { try { stream.close(); } catch (IOException e) { handleIOException(e,"closing stream"); } } } catch (AlfrescoDownException e) { handleAlfrescoDownException(e,"processing"); } catch (ManifoldCFException e) { if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) errorCode = null; throw e; } finally { if (errorCode != null) activities.recordActivity(new Long(startTime), ACTIVITY_FETCH, fileLengthLong, doc, errorCode, errorDesc, null); } } } protected final static long interruptionRetryTime = 5L*60L*1000L; protected static void handleAlfrescoDownException(AlfrescoDownException e, String context) throws ManifoldCFException, ServiceInterruption { long currentTime = System.currentTimeMillis(); // Server doesn't appear to by up. Try for a brief time then give up. String message = "Server appears down during "+context+": "+e.getMessage(); Logging.connectors.warn(message,e); throw new ServiceInterruption(message, e, currentTime + interruptionRetryTime, -1L, 3, true); } protected static void handleIOException(IOException e, String context) throws ManifoldCFException, ServiceInterruption { if ((e instanceof InterruptedIOException) && (!(e instanceof java.net.SocketTimeoutException))) throw new ManifoldCFException(e.getMessage(), ManifoldCFException.INTERRUPTED); long currentTime = System.currentTimeMillis(); if (e instanceof java.net.ConnectException) { // Server isn't up at all. Try for a brief time then give up. String message = "Server could not be contacted during "+context+": "+e.getMessage(); Logging.connectors.warn(message,e); throw new ServiceInterruption(message, e, currentTime + interruptionRetryTime, -1L, 3, true); } if (e instanceof java.net.SocketTimeoutException) { String message2 = "Socket timeout exception during "+context+": "+e.getMessage(); Logging.connectors.warn(message2,e); throw new ServiceInterruption(message2, e, currentTime + interruptionRetryTime, currentTime + 20L * 60000L, -1, false); } if (e.getClass().getName().equals("java.net.SocketException")) { // In the past we would have treated this as a straight document rejection, and // treated it in the same manner as a 400. The reasoning is that the server can // perfectly legally send out a 400 and drop the connection immediately thereafter, // this a race condition. // However, Solr 4.0 (or the Jetty version that the example runs on) seems // to have a bug where it drops the connection when two simultaneous documents come in // at the same time. This is the final version of Solr 4.0 so we need to deal with // this. if (e.getMessage().toLowerCase(Locale.ROOT).indexOf("broken pipe") != -1 || e.getMessage().toLowerCase(Locale.ROOT).indexOf("connection reset") != -1 || e.getMessage().toLowerCase(Locale.ROOT).indexOf("target server failed to respond") != -1) { // Treat it as a service interruption, but with a limited number of retries. // In that way we won't burden the user with a huge retry interval; it should // give up fairly quickly, and yet NOT give up if the error was merely transient String message = "Server dropped connection during "+context+": "+e.getMessage(); Logging.connectors.warn(message,e); throw new ServiceInterruption(message, e, currentTime + interruptionRetryTime, -1L, 3, false); } // Other socket exceptions are service interruptions - but if we keep getting them, it means // that a socket timeout is probably set too low to accept this particular document. So // we retry for a while, then skip the document. String message2 = "Socket exception during "+context+": "+e.getMessage(); Logging.connectors.warn(message2,e); throw new ServiceInterruption(message2, e, currentTime + interruptionRetryTime, currentTime + 20L * 60000L, -1, false); } // Otherwise, no idea what the trouble is, so presume that retries might fix it. String message3 = "IO exception during "+context+": "+e.getMessage(); Logging.connectors.warn(message3,e); throw new ServiceInterruption(message3, e, currentTime + interruptionRetryTime, currentTime + 2L * 60L * 60000L, -1, true); } @Override public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, List<String> tabsArray) throws ManifoldCFException, IOException { ConfigurationHandler.outputConfigurationHeader(threadContext, out, locale, parameters, tabsArray); } @Override public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters, String tabName) throws ManifoldCFException, IOException { ConfigurationHandler.outputConfigurationBody(threadContext, out, locale, parameters, tabName); } @Override public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext, Locale locale, ConfigParams parameters) throws ManifoldCFException { return ConfigurationHandler.processConfigurationPost(threadContext, variableContext, locale, parameters); } @Override public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale, ConfigParams parameters) throws ManifoldCFException, IOException { ConfigurationHandler.viewConfiguration(threadContext, out, locale, parameters); } @Override public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber, List<String> tabsArray) throws ManifoldCFException, IOException { ConfigurationHandler.outputSpecificationHeader(out, locale, os, connectionSequenceNumber, tabsArray); } @Override public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber, int actualSequenceNumber, String tabName) throws ManifoldCFException, IOException { ConfigurationHandler.outputSpecificationBody(out, locale, os, connectionSequenceNumber, actualSequenceNumber, tabName); } @Override public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os, int connectionSequenceNumber) throws ManifoldCFException { return ConfigurationHandler.processSpecificationPost(variableContext, locale, os, connectionSequenceNumber); } @Override public void viewSpecification(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber) throws ManifoldCFException, IOException { ConfigurationHandler.viewSpecification(out, locale, os, connectionSequenceNumber); } }