package com.twitter.elephantbird.mapreduce;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import com.twitter.elephantbird.util.HadoopCompat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import com.twitter.elephantbird.lucene.HdfsMergeTool;
import com.twitter.elephantbird.mapreduce.input.LuceneIndexCollectAllRecordReader;
import com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat;
import com.twitter.elephantbird.mapreduce.output.LuceneIndexOutputFormat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
/**
* End-to-end test of {@link LuceneIndexOutputFormat} and {@link LuceneIndexInputFormat}
* <ol>
* <li>Builds three indexes of small documents (text from the Iliad and Macbeth)</li>
* <li>Searches the indexes</li>
* <li>Verifies that the correct results are found</li>
* </ol>
*
* @author Alex Levenson
*/
public class LuceneIndexingIntegrationTest {
public static final Path[] INPUT_PATHS = new Path[]{
new Path("src/test/resources/com/twitter/elephantbird/mapreduce/test_documents1.txt"),
new Path("src/test/resources/com/twitter/elephantbird/mapreduce/test_documents2.txt"),
new Path("src/test/resources/com/twitter/elephantbird/mapreduce/test_documents3.txt")
};
public static final List<String> QUERIES =
Lists.newArrayList("+(macbeth achilles)", "+shield", "+dusty +death.");
public static final Map<Integer, Set<String>> expectedResults;
static {
expectedResults = Maps.newHashMap();
expectedResults.put(1, Sets.newHashSet(
"Then when he had fashioned the shield so great and strong, "
+ "he made a breastplate also that shone brighter than fire.",
"He made the shield in five thicknesses, and with many a wonder did "
+ "his cunning hand enrich it.",
"All round the outermost rim of the shield he set the mighty stream of the river Oceanus.",
"First he shaped the shield so great and strong, adorning it all over and binding "
+ "it round with a gleaming circuit in three layers;"));
expectedResults.put(2, Sets.newHashSet("The way to dusty death. Out, out, brief candle! "
+ "Life's but a walking shadow, a poor player"));
}
public static class IndexOutputFormat extends LuceneIndexOutputFormat<NullWritable, Text> {
private final Document doc;
private final Field textField;
public IndexOutputFormat() {
doc = new Document();
textField = new TextField("text", "", Field.Store.YES);
doc.add(textField);
}
@Override
public Document buildDocument(NullWritable key, Text value) throws IOException {
textField.setStringValue(value.toString());
return doc;
}
@Override
public Analyzer newAnalyzer(Configuration conf) {
return new WhitespaceAnalyzer(Version.LUCENE_40);
}
}
private static class IndexMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(NullWritable.get(), value);
}
}
private static class IndexReducer extends Reducer<NullWritable, Text, NullWritable, Text> {
@Override
protected void reduce(NullWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
context.write(NullWritable.get(), value);
}
}
}
public static class IndexInputFormat extends LuceneIndexInputFormat<Text> {
@Override
public PathFilter getIndexDirPathFilter(Configuration conf) throws IOException {
return LuceneIndexOutputFormat.newIndexDirFilter(conf);
}
@Override
public RecordReader<IntWritable, Text>
createRecordReader(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
return new LuceneIndexCollectAllRecordReader<Text>() {
private QueryParser parser = new QueryParser(Version.LUCENE_40,
"text",
new WhitespaceAnalyzer(Version.LUCENE_40));
private Text text = new Text();
@Override
protected Query deserializeQuery(String serializedQuery) throws IOException {
try {
return parser.parse(serializedQuery);
} catch (ParseException e) {
throw new RuntimeException(e);
}
}
@Override
protected Text docToValue(Document doc) {
text.set(doc.get("text"));
return text;
}
};
}
}
private static class SearchMapper extends Mapper<IntWritable, Text, IntWritable, Text> {
@Override
protected void map(IntWritable key, Text value, Context context)
throws IOException, InterruptedException {
context.write(key, value);
}
}
private static class SearchReducer extends Reducer<IntWritable, Text, IntWritable, Text> {
@Override
protected void reduce(IntWritable key, Iterable<Text> values, Context context)
throws IOException, InterruptedException {
for (Text value : values) {
context.write(key, value);
}
}
}
@Rule
public TemporaryFolder tempDir = new TemporaryFolder();
@Test
public void testIndexing() throws Exception {
List<Path> indexes = Lists.newLinkedList();
for (Path input : INPUT_PATHS) {
Job job = new Job();
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(IndexOutputFormat.class);
job.setMapperClass(IndexMapper.class);
job.setReducerClass(IndexReducer.class);
job.setOutputValueClass(Text.class);
job.setOutputKeyClass(NullWritable.class);
TextInputFormat.setInputPaths(job, input);
Path output = new Path(
new File(tempDir.getRoot(), input.getName()).getAbsolutePath());
indexes.add(output);
IndexOutputFormat.setOutputPath(job, output);
assertTrue("Indexing of " + input + " failed!", job.waitForCompletion(true));
}
doSearch(indexes,
new Path(new File(tempDir.getRoot(), "search_results").getAbsolutePath()),
"Failed searching un-merged indexes");
File mergeIndex = new File(tempDir.getRoot(), "index-merged");
String[] args = new String[indexes.size() + 2];
args[0] = mergeIndex.getAbsolutePath();
args[1] = "100";
int i = 2;
for (Path indexPath : indexes) {
args[i++] = indexPath.toString() + "/index-0";
}
HdfsMergeTool.main(args);
doSearch(Lists.newArrayList(new Path(mergeIndex.getAbsolutePath())),
new Path(new File(tempDir.getRoot(), "merge_search_results").getAbsolutePath()),
"Failed searching merged index");
}
private void doSearch(List<Path> inputPaths, Path outputPath, String failureMessage) throws Exception {
Job job = new Job();
job.setInputFormatClass(IndexInputFormat.class);
job.setOutputFormatClass(TextOutputFormat.class);
job.setMapperClass(SearchMapper.class);
job.setReducerClass(SearchReducer.class);
job.setOutputKeyClass(IntWritable.class);
job.setOutputValueClass(Text.class);
IndexInputFormat.setInputPaths(inputPaths, HadoopCompat.getConfiguration(job));
IndexInputFormat.setQueries(QUERIES, job.getConfiguration());
TextOutputFormat.setOutputPath(job, outputPath);
assertTrue(failureMessage, job.waitForCompletion(true));
File resultsFile = new File(new File(outputPath.toString()), "part-r-00000");
assertEquals(expectedResults, parseResultsFile(resultsFile));
}
public static Map<Integer, Set<String>> parseResultsFile(File file) throws IOException {
return Files.readLines(file, Charsets.UTF_8,
new LineProcessor<Map<Integer, Set<String>>>() {
private Map<Integer, Set<String>> results = Maps.newHashMap();
@Override
public boolean processLine(String s) throws IOException {
String[] parts = s.split("\t");
int query = Integer.valueOf(parts[0]);
if (results.get(query) == null) {
results.put(query, Sets.<String>newHashSet());
}
results.get(query).add(parts[1]);
return true;
}
@Override
public Map<Integer, Set<String>> getResult() {
return results;
}
});
}
}