package com.linkedin.camus.etl.kafka;
import static org.hamcrest.core.Is.is;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
import com.linkedin.camus.etl.kafka.coders.FailDecoder;
import com.linkedin.camus.etl.kafka.coders.JsonStringMessageDecoder;
import com.linkedin.camus.etl.kafka.common.SequenceFileRecordWriterProvider;
import com.linkedin.camus.etl.kafka.mapred.EtlInputFormat;
import com.linkedin.camus.etl.kafka.mapred.EtlMultiOutputFormat;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
import kafka.serializer.StringEncoder;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import com.google.gson.Gson;
public class CamusJobTest {
private static final Random RANDOM = new Random();
private static final String BASE_PATH = "/camus";
private static final String DESTINATION_PATH = BASE_PATH + "/destination";
private static final String EXECUTION_BASE_PATH = BASE_PATH + "/execution";
private static final String EXECUTION_HISTORY_PATH = EXECUTION_BASE_PATH + "/history";
private static final String TOPIC_1 = "topic_1";
private static final String TOPIC_2 = "topic_2";
private static final String TOPIC_3 = "topic_3";
private static KafkaCluster cluster;
private static FileSystem fs;
private static Gson gson;
private static Map<String, List<Message>> messagesWritten;
@BeforeClass
public static void beforeClass() throws IOException {
cluster = new KafkaCluster();
fs = FileSystem.get(new Configuration());
gson = new Gson();
// You can't delete messages in Kafka so just writing a set of known messages that can be used for testing
messagesWritten = new HashMap<String, List<Message>>();
messagesWritten.put(TOPIC_1, writeKafka(TOPIC_1, 10));
messagesWritten.put(TOPIC_2, writeKafka(TOPIC_2, 10));
messagesWritten.put(TOPIC_3, writeKafka(TOPIC_3, 10));
}
@AfterClass
public static void afterClass() {
cluster.shutdown();
}
private Properties props;
private CamusJob job;
private TemporaryFolder folder;
private String destinationPath;
@Before
public void before() throws IOException, NoSuchFieldException, IllegalAccessException {
resetCamus();
folder = new TemporaryFolder();
folder.create();
String path = folder.getRoot().getAbsolutePath();
destinationPath = path + DESTINATION_PATH;
props = cluster.getProps();
props.setProperty(EtlMultiOutputFormat.ETL_DESTINATION_PATH, destinationPath);
props.setProperty(CamusJob.ETL_EXECUTION_BASE_PATH, path + EXECUTION_BASE_PATH);
props.setProperty(CamusJob.ETL_EXECUTION_HISTORY_PATH, path + EXECUTION_HISTORY_PATH);
props.setProperty(EtlInputFormat.CAMUS_MESSAGE_DECODER_CLASS, JsonStringMessageDecoder.class.getName());
props.setProperty(EtlMultiOutputFormat.ETL_RECORD_WRITER_PROVIDER_CLASS,
SequenceFileRecordWriterProvider.class.getName());
props.setProperty(EtlMultiOutputFormat.ETL_RUN_TRACKING_POST, Boolean.toString(false));
props.setProperty(CamusJob.KAFKA_CLIENT_NAME, "Camus");
props.setProperty(CamusJob.KAFKA_BROKERS, props.getProperty("metadata.broker.list"));
// Run Map/Reduce tests in process for hadoop2
props.setProperty("mapreduce.framework.name", "local");
// Run M/R for Hadoop1
props.setProperty("mapreduce.jobtracker.address", "local");
job = new CamusJob(props);
}
@After
public void after() throws IOException, NoSuchFieldException, SecurityException, IllegalArgumentException,
IllegalAccessException {
// Delete all camus data
folder.delete();
Field field = EtlMultiOutputFormat.class.getDeclaredField("committer");
field.setAccessible(true);
field.set(null, null);
}
@Test
public void runJob() throws Exception {
job.run();
assertCamusContains(TOPIC_1);
assertCamusContains(TOPIC_2);
assertCamusContains(TOPIC_3);
// Run a second time (no additional messages should be found)
job = new CamusJob(props);
job.run();
assertCamusContains(TOPIC_1);
assertCamusContains(TOPIC_2);
assertCamusContains(TOPIC_3);
}
@Test
public void runJobWithoutErrorsAndFailOnErrors() throws Exception {
props.setProperty(CamusJob.ETL_FAIL_ON_ERRORS, Boolean.TRUE.toString());
job = new CamusJob(props);
runJob();
}
@Test(expected = RuntimeException.class)
public void runJobWithErrorsAndFailOnErrors() throws Exception {
props.setProperty(CamusJob.ETL_FAIL_ON_ERRORS, Boolean.TRUE.toString());
props.setProperty(EtlInputFormat.CAMUS_MESSAGE_DECODER_CLASS, FailDecoder.class.getName());
props.setProperty(CamusJob.ETL_MAX_PERCENT_SKIPPED_OTHER, "100.0");
job = new CamusJob(props);
job.run();
}
private void assertCamusContains(String topic) throws InstantiationException, IllegalAccessException, IOException {
assertCamusContains(topic, messagesWritten.get(topic));
}
private void assertCamusContains(String topic, List<Message> messages) throws InstantiationException,
IllegalAccessException, IOException {
List<Message> readMessages = readMessages(topic);
assertThat(readMessages.size(), is(messages.size()));
assertTrue(readMessages(topic).containsAll(messages));
}
private static List<Message> writeKafka(String topic, int numOfMessages) {
List<Message> messages = new ArrayList<Message>();
List<KeyedMessage<String, String>> kafkaMessages = new ArrayList<KeyedMessage<String, String>>();
for (int i = 0; i < numOfMessages; i++) {
Message msg = new Message(RANDOM.nextInt());
messages.add(msg);
kafkaMessages.add(new KeyedMessage<String, String>(topic, Integer.toString(i), gson.toJson(msg)));
}
Properties producerProps = cluster.getProps();
producerProps.setProperty("serializer.class", StringEncoder.class.getName());
producerProps.setProperty("key.serializer.class", StringEncoder.class.getName());
Producer<String, String> producer = new Producer<String, String>(new ProducerConfig(producerProps));
try {
producer.send(kafkaMessages);
} finally {
producer.close();
}
return messages;
}
private List<Message> readMessages(String topic) throws IOException, InstantiationException, IllegalAccessException {
return readMessages(new Path(destinationPath, topic));
}
private List<Message> readMessages(Path path) throws IOException, InstantiationException, IllegalAccessException {
List<Message> messages = new ArrayList<Message>();
try {
for (FileStatus file : fs.listStatus(path)) {
if (file.isDir()) {
messages.addAll(readMessages(file.getPath()));
} else {
SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), new Configuration());
try {
LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
Text value = (Text) reader.getValueClass().newInstance();
while (reader.next(key, value)) {
messages.add(gson.fromJson(value.toString(), Message.class));
}
} finally {
reader.close();
}
}
}
} catch (FileNotFoundException e) {
System.out.println("No camus messages were found in [" + path + "]");
}
return messages;
}
private static class Message {
private int number;
// Used by Gson
public Message() {
}
public Message(int number) {
this.number = number;
}
@Override
public boolean equals(Object obj) {
if (obj == null || !(obj instanceof Message))
return false;
Message other = (Message) obj;
return number == other.number;
}
}
private static void resetCamus() throws NoSuchFieldException, IllegalAccessException {
// The EtlMultiOutputFormat has a static private field called committer which is only created if null. The problem is this
// writes the Camus metadata meaning the first execution of the camus job defines where all committed output goes causing us
// problems if you want to run Camus again using the meta data (i.e. what offsets we processed). Setting it null here forces
// it to re-instantiate the object with the appropriate output path
Field field = EtlMultiOutputFormat.class.getDeclaredField("committer");
field.setAccessible(true);
field.set(null, null);
}
}