package com.mongodb.hadoop;

import com.mongodb.BasicDBObject;
import com.mongodb.Block;
import com.mongodb.MongoClient;
import com.mongodb.client.gridfs.GridFSBucket;
import com.mongodb.client.gridfs.GridFSBuckets;
import com.mongodb.client.gridfs.GridFSUploadStream;
import com.mongodb.client.gridfs.model.GridFSFile;
import com.mongodb.client.gridfs.model.GridFSUploadOptions;
import com.mongodb.hadoop.testutils.BaseHadoopTest;
import com.mongodb.hadoop.util.MongoConfigUtil;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.bson.Document;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
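
/**
 * Tests for {@link GridFSInputFormat}: verifies split calculation and the
 * behavior of the text and binary record readers against files stored in
 * GridFS. Assumes a MongoDB instance is reachable on localhost:27017.
 */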
public class GridFSInputFormatTest extends BaseHadoopTest {
private static MongoClient client = new MongoClient();
private static GridFSInputFormat inputFormat =
new GridFSInputFormat();
private static String[] readmeSections;
private static GridFSBucket bucket = GridFSBuckets.create(
client.getDatabase("mongo_hadoop"));
private static StringBuilder fileContents;
private static GridFSFile readme;
private static GridFSFile bson;
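
    /**
     * Upload a file into the GridFS bucket using a small chunk size, so that
     * each test file is stored as multiple chunks.
     */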
private static void uploadFile(final File file)
throws IOException {
        // Use a small chunk size so each file is stored as multiple chunks.
        GridFSUploadStream gridfsStream = bucket.openUploadStream(
            file.getName(), new GridFSUploadOptions().chunkSizeBytes(1024));
        FileInputStream inputStream = new FileInputStream(file);
        try {
            IOUtils.copy(inputStream, gridfsStream);
        } finally {
            IOUtils.closeQuietly(inputStream);
            gridfsStream.close();
        }
}
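
    /**
     * Delete every GridFS file whose filename matches the given name.
     */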
private static void cleanFile(final String filename) {
bucket.find(new Document("filename", filename)).forEach(
new Block<GridFSFile>() {
@Override
public void apply(final GridFSFile gridFSFile) {
bucket.delete(gridFSFile.getObjectId());
}
}
);
}
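
    /**
     * Upload fresh copies of the test files and read the README into memory
     * so the tests can compare record reader output against its contents.
     */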
@BeforeClass
public static void setUpClass() throws IOException, URISyntaxException {
// Clean up files and re-upload them.
cleanFile("README.md");
cleanFile("orders.bson");
File bsonFile = new File(GridFSInputFormatTest.class.getResource(
"/bookstore-dump/orders.bson").toURI().getPath());
uploadFile(bsonFile);
File readmeFile = new File(PROJECT_HOME, "README.md");
uploadFile(readmeFile);
        // Read the README into memory so the tests can compare record
        // reader output against its contents.
        fileContents = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(readmeFile));
        try {
            char[] buff = new char[1024];
            int charsRead;
            while ((charsRead = reader.read(buff)) > 0) {
                fileContents.append(buff, 0, charsRead);
            }
        } finally {
            reader.close();
        }
        // Split the README into sections delimited by Markdown headings
        // ("#", "##", ...).
        readmeSections = Pattern.compile("#+").split(fileContents);
readme = bucket.find(new Document("filename", "README.md")).first();
bson = bucket.find(new Document("filename", "orders.bson")).first();
}
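
    /**
     * Remove the uploaded test files from GridFS.
     */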
    @AfterClass
    public static void tearDownClass() {
        cleanFile("README.md");
        cleanFile("orders.bson");
        client.close();
    }
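
    /**
     * Build a Configuration pointing at the local GridFS bucket, querying
     * for the README only.
     */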
private static Configuration getConfiguration() {
Configuration conf = new Configuration();
MongoConfigUtil.setInputURI(
conf, "mongodb://localhost:27017/mongo_hadoop.fs");
MongoConfigUtil.setQuery(
conf, new BasicDBObject("filename", "README.md"));
return conf;
}
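
    /**
     * Create a mock JobContext backed by the given Configuration.
     */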
private static JobContext mockJobContext(final Configuration conf) {
JobContext context = mock(JobContext.class);
when(context.getConfiguration()).thenReturn(conf);
return context;
}
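
    /**
     * Create a mock TaskAttemptContext backed by the given Configuration.
     */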
private static TaskAttemptContext mockTaskAttemptContext(
final Configuration conf) {
TaskAttemptContext context = mock(TaskAttemptContext.class);
when(context.getConfiguration()).thenReturn(conf);
return context;
}
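
    /**
     * Compute the input splits for the README using the default test
     * configuration.
     */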
    private List<InputSplit> getSplits()
        throws IOException, InterruptedException {
        return inputFormat.getSplits(mockJobContext(getConfiguration()));
    }
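
    /**
     * getSplits should return one split per GridFS chunk of the README,
     * i.e. ceil(file length / chunk size).
     */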
@Test
public void testGetSplits() throws IOException, InterruptedException {
assertEquals(
(int) Math.ceil(
readme.getLength() / (float) readme.getChunkSize()),
getSplits().size());
}
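
    /**
     * With the delimiter pattern "#+", the text record reader should emit
     * one record per Markdown section of the README.
     */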
@Test
public void testRecordReader() throws IOException, InterruptedException {
List<InputSplit> splits = getSplits();
Configuration conf = getConfiguration();
// Split README by sections in Markdown.
MongoConfigUtil.setGridFSDelimiterPattern(conf, "#+");
TaskAttemptContext context = mockTaskAttemptContext(conf);
List<String> sections = new ArrayList<String>();
for (InputSplit split : splits) {
            GridFSInputFormat.GridFSTextRecordReader reader =
                new GridFSInputFormat.GridFSTextRecordReader();
reader.initialize(split, context);
while (reader.nextKeyValue()) {
sections.add(reader.getCurrentValue().toString());
}
}
assertEquals(Arrays.asList(readmeSections), sections);
}
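
    /**
     * With an empty delimiter, concatenating every record should reproduce
     * the README exactly.
     */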
@Test
public void testRecordReaderNoDelimiter()
throws IOException, InterruptedException {
List<InputSplit> splits = getSplits();
Configuration conf = getConfiguration();
// Empty delimiter == no delimiter.
MongoConfigUtil.setGridFSDelimiterPattern(conf, "");
TaskAttemptContext context = mockTaskAttemptContext(conf);
StringBuilder fileText = new StringBuilder();
for (InputSplit split : splits) {
GridFSInputFormat.GridFSTextRecordReader reader =
new GridFSInputFormat.GridFSTextRecordReader();
reader.initialize(split, context);
while (reader.nextKeyValue()) {
fileText.append(reader.getCurrentValue().toString());
}
}
assertEquals(fileContents.toString(), fileText.toString());
}
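
    /**
     * With whole-file splitting enabled there should be a single split,
     * which still yields one record per Markdown section.
     */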
@Test
public void testReadWholeFile() throws IOException, InterruptedException {
Configuration conf = getConfiguration();
MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
JobContext jobContext = mockJobContext(conf);
List<InputSplit> splits = inputFormat.getSplits(jobContext);
        // Split README by sections in Markdown.
        MongoConfigUtil.setGridFSDelimiterPattern(conf, "#+");
TaskAttemptContext context = mockTaskAttemptContext(conf);
assertEquals(1, splits.size());
List<String> sections = new ArrayList<String>();
for (InputSplit split : splits) {
GridFSInputFormat.GridFSTextRecordReader reader =
new GridFSInputFormat.GridFSTextRecordReader();
reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                sections.add(reader.getCurrentValue().toString());
            }
}
assertEquals(Arrays.asList(readmeSections), sections);
}
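
    /**
     * With whole-file splitting and no delimiter, the reader should emit
     * exactly one record containing the entire README.
     */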
@Test
public void testReadWholeFileNoDelimiter()
throws IOException, InterruptedException {
Configuration conf = getConfiguration();
MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
JobContext jobContext = mockJobContext(conf);
List<InputSplit> splits = inputFormat.getSplits(jobContext);
// Empty delimiter == no delimiter.
MongoConfigUtil.setGridFSDelimiterPattern(conf, "");
TaskAttemptContext context = mockTaskAttemptContext(conf);
assertEquals(1, splits.size());
String fileText = null;
for (InputSplit split : splits) {
GridFSInputFormat.GridFSTextRecordReader reader =
new GridFSInputFormat.GridFSTextRecordReader();
reader.initialize(split, context);
int i;
for (i = 0; reader.nextKeyValue(); ++i) {
fileText = reader.getCurrentValue().toString();
}
assertEquals(1, i);
}
assertEquals(fileContents.toString(), fileText);
}
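
    /**
     * The binary record reader should emit a single record containing the
     * raw bytes of the whole BSON file.
     */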
@Test
public void testReadBinaryFiles()
throws IOException, InterruptedException, URISyntaxException {
Configuration conf = getConfiguration();
MongoConfigUtil.setQuery(conf,
new BasicDBObject("filename", "orders.bson"));
MongoConfigUtil.setGridFSWholeFileSplit(conf, true);
MongoConfigUtil.setGridFSReadBinary(conf, true);
JobContext context = mockJobContext(conf);
TaskAttemptContext taskContext = mockTaskAttemptContext(conf);
List<InputSplit> splits = inputFormat.getSplits(context);
assertEquals(1, splits.size());
int i = 0;
byte[] buff = null;
for (InputSplit split : splits) {
GridFSInputFormat.GridFSBinaryRecordReader reader =
new GridFSInputFormat.GridFSBinaryRecordReader();
reader.initialize(split, taskContext);
for (; reader.nextKeyValue(); ++i) {
buff = new byte[reader.getCurrentValue().getLength()];
// BytesWritable.copyBytes does not exist in Hadoop 1.2
System.arraycopy(
reader.getCurrentValue().getBytes(), 0,
buff, 0, buff.length);
}
}
// Only one record to read on the split.
assertEquals(1, i);
assertNotNull(buff);
assertEquals(bson.getLength(), buff.length);
}
}