/*
* Copyright (c) 2014, Cloudera, Inc. All Rights Reserved.
*
* Cloudera, Inc. licenses this file to you under the Apache License,
* Version 2.0 (the "License"). You may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied. See the License for
* the specific language governing permissions and limitations under the
* License.
*/
package com.cloudera.oryx.contrib.flume;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import java.util.List;
import org.apache.flume.Channel;
import org.apache.flume.ChannelException;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Sink;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.conf.ConfigurationException;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.sink.AbstractSink;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.utils.URIBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
 * <p>
 * A Flume {@link Sink} implementation that sends events to an instance of Cloudera Oryx's serving
 * layer.
 * </p>
 * <p>
 * Events are taken from the {@link Channel} in batches of the configured <tt>batchSize</tt>. The
 * events are processed to extract the configured <tt>oryxFields</tt> and the values are transformed
 * into CSV records. The records are sent to Oryx in a HTTP POST request.
 * </p>
 * <p>
 * Batch underruns (i.e. batches smaller than the configured <tt>batchSize</tt>) are supported. If
 * the channel returns a null event, meaning it is empty, then the batch is immediately sent,
 * regardless of size.
 * </p>
 * <p>
 * The channel transaction is committed only after Oryx has answered the POST with a 2xx status.
 * If the request fails, or Oryx answers with any other status, the transaction is rolled back so
 * the events stay in the channel and are retried on a later call to {@link #process()}.
 * </p>
 * <p>
 * For more information on Oryx see the project's GitHub: https://github.com/cloudera/oryx
 * </p>
 */
public final class OryxEventSink extends AbstractSink implements Configurable {

  private static final Logger log = LoggerFactory.getLogger(OryxEventSink.class);

  /** The maximum number of events to take from the channel per transaction */
  private static final String BATCH_SIZE = "batchSize";
  private static final int DEFAULT_BATCH_SIZE = 100;

  /** The hostname running the Oryx serving layer instance **/
  private static final String ORYX_HOSTNAME = "oryxHostname";

  /** The port the Oryx serving layer instance is listening on **/
  private static final String ORYX_PORT = "oryxPort";
  private static final int ORYX_DEFAULT_PORT = 80;

  /** The endpoint path for Oryx's REST API **/
  private static final String ORYX_ENDPOINT = "oryxEndpoint";
  private static final String ORYX_DEFAULT_ENDPOINT = "/ingest";

  /** A {@link OryxEventParser} implementation */
  private static final String ORYX_EVENT_PARSER = "oryxEventParser";

  /**
   * A list of fields to extract from an event and send to Oryx. Multiple <tt>oryxFields</tt> can be
   * specified by using a numeric postfix (i.e. exploding an event):
   * <ul>
   * <li>oryxFields = user,item[,strength]</li>
   * <li>oryxFields.0 = user,item0[,strength0]</li>
   * <li>oryxFields.1 = user,item1[,strength1]</li>
   * <li>oryxFields.2 = user,item2[,strength2]</li>
   * </ul>
   **/
  private static final String ORYX_FIELDS = "oryxFields";

  /**
   * Content type of the POST body. The charset is set explicitly to UTF-8 because
   * {@code ContentType.TEXT_PLAIN} carries ISO-8859-1, which would replace every character outside
   * Latin-1 with {@code '?'} when the body is encoded.
   */
  private static final ContentType ORYX_CONTENT_TYPE = ContentType.create("text/plain", "UTF-8");

  /** Lowest HTTP status code (inclusive) that counts as a successful delivery */
  private static final int HTTP_SUCCESS_MIN = 200;
  /** Lowest HTTP status code above the success range (exclusive upper bound) */
  private static final int HTTP_SUCCESS_END = 300;

  private int batchSize;
  private URI oryxUri;
  private List<List<String>> oryxFields;
  private OryxEventParser eventParser;
  private SinkCounter sinkCounter;
  private HttpClient client = null;

  /**
   * Reads the sink configuration: batch size, Oryx URI (hostname, port, endpoint), the
   * {@link OryxEventParser} implementation class and one or more <tt>oryxFields</tt> definitions.
   *
   * @param context the Flume configuration of this sink
   * @throws IllegalStateException if the batch size is not positive, no hostname is given or no
   *         <tt>oryxFields</tt> are given
   * @throws ConfigurationException if the Oryx URI is invalid, the event parser cannot be
   *         instantiated or an <tt>oryxFields</tt> value is malformed
   */
  @Override
  public void configure(Context context) {
    sinkCounter = new SinkCounter(getName());

    batchSize = context.getInteger(BATCH_SIZE, DEFAULT_BATCH_SIZE);
    // A non-positive batch size would make process() take nothing and back off forever.
    Preconditions.checkState(batchSize > 0, "Batch size must be positive: %s", batchSize);

    String oryxEndpoint = context.getString(ORYX_ENDPOINT, ORYX_DEFAULT_ENDPOINT);
    String oryxHostname = context.getString(ORYX_HOSTNAME);
    int oryxPort = context.getInteger(ORYX_PORT, ORYX_DEFAULT_PORT);
    Preconditions.checkState(oryxHostname != null, "No Oryx hostname specified");
    try {
      oryxUri = new URIBuilder().setScheme("http")
          .setHost(oryxHostname)
          .setPort(oryxPort)
          .setPath(oryxEndpoint).build();
    } catch (URISyntaxException e) {
      throw new ConfigurationException(e);
    }

    String parserClass = context.getString(ORYX_EVENT_PARSER);
    try {
      eventParser = OryxEventParser.class.cast(Class.forName(parserClass).getConstructor().newInstance());
    } catch (Exception e) {
      // Also reached when no parser class is configured (parserClass == null).
      throw new ConfigurationException("Unable to load Oryx event parser: " + parserClass, e);
    }

    oryxFields = Lists.newArrayList();
    String fields = context.getString(ORYX_FIELDS);
    if (fields != null) {
      addFields(fields);
    }
    // Numbered definitions must be contiguous from 0; the first missing index ends the scan.
    for (int i = 0;; i++) {
      fields = context.getString(ORYX_FIELDS + '.' + i);
      if (fields == null) {
        break;
      }
      addFields(fields);
    }
    Preconditions.checkState(!oryxFields.isEmpty(), "No Oryx fields specified");

    if (log.isDebugEnabled()) {
      log.debug("Batch size: {}", batchSize);
      log.debug("Oryx URI: {}", oryxUri);
      log.debug("Event parser: {}", eventParser.getClass().getName());
      log.debug("Number of oryxFields: {}", oryxFields.size());
    }
  }

  /**
   * Parses one comma-separated <tt>oryxFields</tt> value and appends it to {@link #oryxFields}.
   *
   * @param fields a value of the form {@code user,item[,strength]}
   * @throws ConfigurationException if the value does not contain two or three items
   */
  private void addFields(String fields) {
    String[] items = fields.split(",");
    if (items.length < 2 || items.length > 3) {
      throw new ConfigurationException("Incorrect number of items. " + fields
          + " should be user,item[,strength]");
    }
    for (int i = 0; i < items.length; i++) {
      items[i] = items[i].trim();
    }
    if (log.isDebugEnabled()) {
      log.debug("Adding {}: {}", ORYX_FIELDS, items);
    }
    oryxFields.add(Lists.newArrayList(items));
  }

  /** Creates the HTTP client and starts the sink counter before starting the sink itself. */
  @Override
  public synchronized void start() {
    log.info("Starting Oryx sink: {}", getName());
    client = new DefaultHttpClient();
    sinkCounter.start();
    super.start();
  }

  /** Stops the sink counter, releases the HTTP client's connections and stops the sink. */
  @Override
  public synchronized void stop() {
    log.info("Stopping Oryx sink: {}", getName());
    sinkCounter.stop();
    // Release pooled connections; without this every start/stop cycle leaks the client's sockets.
    if (client != null) {
      client.getConnectionManager().shutdown();
      client = null;
    }
    super.stop();
    log.info("Oryx sink {} stopped: {}", getName(), sinkCounter);
  }

  /**
   * Sends the given {@code batch} to Oryx in a HTTP POST request, one record per line.
   *
   * @param batch the batch of records to send to Oryx
   * @throws IOException if the request cannot be executed or Oryx answers with a non-2xx status,
   *         so that the caller can roll back the channel transaction instead of dropping the batch
   */
  private void processBatch(Collection<String> batch) throws IOException {
    if (log.isDebugEnabled()) {
      log.debug("Sending batch of {} records to Oryx at {}", batch.size(), oryxUri);
    }

    StringBuilder sb = new StringBuilder();
    for (String record : batch) {
      sb.append(record).append('\n');
    }

    HttpPost post = new HttpPost(oryxUri);
    HttpEntity entity = new StringEntity(sb.toString(), ORYX_CONTENT_TYPE);
    post.setEntity(entity);

    HttpResponse response = client.execute(post);
    try {
      if (log.isDebugEnabled()) {
        log.debug("HTTP response from Oryx: '{}'", response.getStatusLine());
      }
      int statusCode = response.getStatusLine().getStatusCode();
      if (statusCode < HTTP_SUCCESS_MIN || statusCode >= HTTP_SUCCESS_END) {
        throw new IOException("Unable to POST batch to Oryx at " + oryxUri + ": "
            + response.getStatusLine());
      }
    } finally {
      // Always drain the response so the underlying connection can be reused.
      EntityUtils.consumeQuietly(response.getEntity());
    }
  }

  /**
   * Takes up to <tt>batchSize</tt> events from the channel in one transaction, converts them to
   * Oryx records with the configured {@link OryxEventParser} and POSTs the records to Oryx.
   *
   * @return {@link Status#READY} if a batch was delivered, {@link Status#BACKOFF} if no records
   *         were produced or the channel could not be read
   * @throws EventDeliveryException if the batch could not be delivered; the transaction is rolled
   *         back in that case
   */
  @Override
  public Status process() throws EventDeliveryException {
    Status status = Status.READY;
    Channel channel = getChannel();
    Transaction transaction = channel.getTransaction();
    List<String> batch = Lists.newArrayList();
    try {
      transaction.begin();
      // The record count is checked BEFORE taking the next event: an event may explode into
      // several records, and an event that has been taken but not parsed would be lost on commit.
      for (int i = 0; i < batchSize && batch.size() < batchSize; i++) {
        Event event = channel.take();
        if (event == null) {
          // underrun if channel is empty
          break;
        }
        eventParser.parseEvent(event, oryxFields, batch);
      }

      int txSize = batch.size();
      if (txSize == 0) {
        sinkCounter.incrementBatchEmptyCount();
        status = Status.BACKOFF;
        if (log.isDebugEnabled()) {
          log.debug("Batch is empty. Backing off");
        }
      } else {
        if (txSize >= batchSize) {
          // The batch size can be bigger than configured if events are being exploded into
          // multiple Oryx records
          sinkCounter.incrementBatchCompleteCount();
        } else {
          sinkCounter.incrementBatchUnderflowCount();
        }
        sinkCounter.addToEventDrainAttemptCount(txSize);
        // Throws on failure, which skips the success count and the commit below.
        processBatch(batch);
        sinkCounter.addToEventDrainSuccessCount(txSize);
      }
      transaction.commit();
    } catch (Throwable t) {
      transaction.rollback();
      if (t instanceof ChannelException) {
        log.error("Oryx sink {} unable to get event from channel {}", getName(), channel.getName(), t);
        status = Status.BACKOFF;
      } else {
        throw new EventDeliveryException("Failed to send events", t);
      }
    } finally {
      transaction.close();
    }
    return status;
  }
}