/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.manifoldcf.agents.output.kafka;

import org.apache.manifoldcf.core.interfaces.*;
import org.apache.manifoldcf.agents.interfaces.*;

import java.util.*;
import java.io.*;
import java.util.Properties;
import java.util.concurrent.ExecutionException;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.PartitionInfo;

/**
 * This is a Kafka output connector.
 */
public class KafkaOutputConnector extends org.apache.manifoldcf.agents.output.BaseOutputConnector {

  public static final String _rcsid = "@(#)$Id: KafkaOutputConnector.java 988245 2010-08-23 18:39:35Z kwright $";

  // Activities we log

  /**
   * Ingestion activity
   */
  public final static String INGEST_ACTIVITY = "document ingest";

  /**
   * Job notify activity
   */
  public final static String JOB_COMPLETE_ACTIVITY = "output notification";

  private final static String KAFKA_TAB_PARAMETERS = "KafkaConnector.Parameters";

  /**
   * Forward to the javascript to check the configuration parameters
   */
  private static final String EDIT_CONFIG_HEADER_FORWARD = "editConfiguration.js";

  /**
   * Forward to the HTML template to edit the configuration parameters
   */
  private static final String EDIT_CONFIG_FORWARD_PARAMETERS = "editConfiguration_Parameters.html";

  /**
   * Forward to the HTML template to view the configuration parameters
   */
  private static final String VIEW_CONFIG_FORWARD = "viewConfiguration.html";

  /**
   * Field name for the file body text.
   */
  private static final String FILE_BODY_TEXT_FIELDNAME = "f_bodytext";

  /**
   * Field name we use for the document's URI.
   */
  private static final String DOCUMENT_URI_FIELDNAME = "document_URI";

  /**
   * The allow attribute name
   */
  protected final static String allowAttributeName = "allow_token_";

  /**
   * The deny attribute name
   */
  protected final static String denyAttributeName = "deny_token_";

  /**
   * The no-security token
   */
  protected final static String noSecurityToken = "__nosecurity__";

  protected final static boolean useNullValue = false;

  KafkaProducer producer = null;

  /**
   * Constructor.
   */
  public KafkaOutputConnector() {
  }

  public void setProducer(KafkaProducer producer) {
    this.producer = producer;
  }

  /**
   * Return the list of activities that this connector supports (i.e. writes
   * into the log).
   *
   * @return the list.
   */
  @Override
  public String[] getActivitiesList() {
    return new String[]{INGEST_ACTIVITY, JOB_COMPLETE_ACTIVITY};
  }

  /**
   * Connect.
   *
   * @param configParameters is the set of configuration parameters, which in
   * this case describe the target appliance, basic auth configuration, etc.
   * (This formerly came out of the ini file.)
   */
  @Override
  public void connect(ConfigParams configParameters) {
    super.connect(configParameters);

    Properties props = new Properties();
    String IP = params.getParameter(KafkaConfig.IP);
    String PORT = params.getParameter(KafkaConfig.PORT);
    //System.out.println("Kafka IP: " + IP);
    //System.out.println("Kafka Port: " + PORT);
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, IP + ":" + PORT);
    props.put(ProducerConfig.RETRIES_CONFIG, "3");
    props.put(ProducerConfig.ACKS_CONFIG, "all");
    props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, "none");
    props.put(ProducerConfig.BATCH_SIZE_CONFIG, 200);
    props.put(ProducerConfig.BLOCK_ON_BUFFER_FULL_CONFIG, true);
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.ByteArraySerializer");
    producer = new KafkaProducer(props);
  }

  /**
   * Close the connection. Call this before discarding the connection.
   */
  @Override
  public void disconnect() throws ManifoldCFException {
    super.disconnect();
  }

  /**
   * Fill in a Server tab configuration parameter map for calling a Velocity
   * template.
   *
   * @param newMap is the map to fill in
   * @param parameters is the current set of configuration parameters
   */
  private static void fillInServerConfigurationMap(Map<String, Object> newMap, IPasswordMapperActivity mapper,
      ConfigParams parameters) {
    String IP = parameters.getParameter(KafkaConfig.IP);
    String port = parameters.getParameter(KafkaConfig.PORT);
    String topic = parameters.getParameter(KafkaConfig.TOPIC);

    if (IP == null) {
      IP = "localhost";
    }
    if (port == null) {
      port = "9092";
    }
    if (topic == null) {
      topic = "topic";
    }

    newMap.put("IP", IP);
    newMap.put("PORT", port);
    newMap.put("TOPIC", topic);
  }

  @Override
  public void outputConfigurationHeader(IThreadContext threadContext, IHTTPOutput out, Locale locale,
      ConfigParams parameters, List<String> tabsArray) throws ManifoldCFException, IOException {
    // Add the Server tab
    tabsArray.add(Messages.getString(locale, KAFKA_TAB_PARAMETERS));
    // Map the parameters
    Map<String, Object> paramMap = new HashMap<String, Object>();
    // Fill in the parameters from each tab
    fillInServerConfigurationMap(paramMap, out, parameters);
    // Output the Javascript - only one Velocity template for all tabs
    Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIG_HEADER_FORWARD, paramMap);
  }

  @Override
  public void outputConfigurationBody(IThreadContext threadContext, IHTTPOutput out, Locale locale,
      ConfigParams parameters, String tabName) throws ManifoldCFException, IOException {
    // Call the Velocity templates for each tab
    Map<String, Object> paramMap = new HashMap<String, Object>();
    // Set the tab name
    paramMap.put("TABNAME", tabName);
    // Fill in the parameters
    fillInServerConfigurationMap(paramMap, out, parameters);
    // Server tab
    Messages.outputResourceWithVelocity(out, locale, EDIT_CONFIG_FORWARD_PARAMETERS, paramMap);
  }

  @Override
  public void viewConfiguration(IThreadContext threadContext, IHTTPOutput out, Locale locale,
      ConfigParams parameters) throws ManifoldCFException, IOException {
    Map<String, Object> paramMap = new HashMap<String, Object>();
    // Fill in map from each tab
    fillInServerConfigurationMap(paramMap, out, parameters);
    Messages.outputResourceWithVelocity(out, locale, VIEW_CONFIG_FORWARD, paramMap);
  }

  @Override
  public String processConfigurationPost(IThreadContext threadContext, IPostParameters variableContext,
      ConfigParams parameters) throws ManifoldCFException {
    // Server tab parameters
    String IP = variableContext.getParameter(KafkaConfig.IP);
    if (IP != null) {
      parameters.setParameter(KafkaConfig.IP, IP);
    }

    String port = variableContext.getParameter(KafkaConfig.PORT);
    if (port != null) {
      parameters.setParameter(KafkaConfig.PORT, port);
    }

    String topic = variableContext.getParameter(KafkaConfig.TOPIC);
    if (topic != null) {
      parameters.setParameter(KafkaConfig.TOPIC, topic);
    }

    return null;
  }

  /**
   * Test the connection. Returns a string describing the connection integrity.
   *
   * @return the connection's status as a displayable string.
   */
  @Override
  public String check() throws ManifoldCFException {
    try {
      // Fetching partition metadata forces a round trip to the broker, which
      // verifies that the configured topic is reachable.
      List<PartitionInfo> partitions = producer.partitionsFor(params.getParameter(KafkaConfig.TOPIC));
      return super.check();
    } catch (Exception e) {
      return "Connection failed: " + e.getMessage();
    }
  }

  /**
   * Get an output version string, given an output specification. The output
   * version string is used to uniquely describe the pertinent details of the
   * output specification and the configuration, to allow the Connector
   * Framework to determine whether a document will need to be output again.
   * Note that the contents of the document cannot be considered by this method,
   * and that a different version string (defined in IRepositoryConnector) is
   * used to describe the version of the actual document.
   *
   * This method presumes that the connector object has been configured, and it
   * is thus able to communicate with the output data store should that be
   * necessary.
   *
   * @param spec is the current output specification for the job that is doing
   * the crawling.
   * @return a string, of unlimited length, which uniquely describes output
   * configuration and specification in such a way that if two such strings are
   * equal, the document will not need to be sent again to the output data
   * store.
   */
  @Override
  public VersionContext getPipelineDescription(Specification spec) throws ManifoldCFException, ServiceInterruption {
    return new VersionContext("", params, spec);
  }

  /**
   * Add (or replace) a document in the output data store using the connector.
   * This method presumes that the connector object has been configured, and it
   * is thus able to communicate with the output data store should that be
   * necessary.
   *
   * @param documentURI is the URI of the document. The URI is presumed to be
   * the unique identifier which the output data store will use to process and
   * serve the document. This URI is constructed by the repository connector
   * which fetches the document, and is thus universal across all output
   * connectors.
   * @param pipelineDescription includes the description string that was
   * constructed for this document by the getOutputDescription() method.
   * @param document is the document data to be processed (handed to the output
   * data store).
   * @param authorityNameString is the name of the authority responsible for
   * authorizing any access tokens passed in with the repository document. May
   * be null.
   * @param activities is the handle to an object that the implementer of a
   * pipeline connector may use to perform operations, such as logging
   * processing activity, or sending a modified document to the next stage in
   * the pipeline.
   * @return the document status (accepted or permanently rejected).
   * @throws IOException only if there's a stream error reading the document
   * data.
   */
  @Override
  public int addOrReplaceDocumentWithException(String documentURI, VersionContext outputDescription,
      RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
      throws ManifoldCFException, ServiceInterruption, IOException {
    //System.out.println("Starting to ingest document....");
    try {
      KafkaMessage kafkaMessage = new KafkaMessage();
      // Get document info in JSON format
      byte[] finalString = kafkaMessage.createJSON(document);
      String topic = getConfig(params, KafkaConfig.TOPIC, "topic");
      ProducerRecord record = new ProducerRecord(topic, finalString);
      // Block until the broker acknowledges the record.
      producer.send(record).get();
    } catch (InterruptedException e) {
      throw new ManifoldCFException("interrupted", ManifoldCFException.INTERRUPTED);
    } catch (ExecutionException e) {
      throw new ManifoldCFException("Error sending record to Kafka: " + e.getMessage(), e);
    }

    activities.recordActivity(null, INGEST_ACTIVITY, new Long(document.getBinaryLength()), documentURI, "OK", null);
    return DOCUMENTSTATUS_ACCEPTED;
  }

  private static String getConfig(ConfigParams config, String parameter, String defaultValue) {
    if (config == null) {
      return defaultValue;
    }
    final String protocol = config.getParameter(parameter);
    if (protocol == null) {
      return defaultValue;
    }
    return protocol;
  }

  /**
   * Notify the connector of a completed job. This is meant to allow the
   * connector to flush any internal data structures it has been keeping around,
   * or to tell the output repository that this is a good time to synchronize
   * things. It is called whenever a job is either completed or aborted.
   *
   * @param activities is the handle to an object that the implementer of an
   * output connector may use to perform operations, such as logging processing
   * activity.
   */
  @Override
  public void noteJobComplete(IOutputNotifyActivity activities) throws ManifoldCFException, ServiceInterruption {
    activities.recordActivity(null, JOB_COMPLETE_ACTIVITY, null, "", "OK", null);
  }
}