package com.twitter.elephantbird.pig;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import java.util.Map;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.pig.ExecType;
import org.apache.pig.Expression;
import org.apache.pig.LoadMetadata;
import org.apache.pig.PigServer;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.util.Utils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import com.twitter.elephantbird.mapreduce.LuceneIndexingIntegrationTest;
import com.twitter.elephantbird.mapreduce.input.LuceneIndexInputFormat;
import com.twitter.elephantbird.pig.load.LuceneIndexLoader;
import com.twitter.elephantbird.pig.store.LuceneIndexStorage;
import static org.junit.Assert.assertEquals;
/**
* End-to-end test of
* {@link com.twitter.elephantbird.pig.store.LuceneIndexStorage} and {@link LuceneIndexLoader}
* <p>
* Similar to {@link LuceneIndexingIntegrationTest}
* <ol>
* <li>Builds three indexes of small documents (text from the Iliad and Macbeth)</li>
* <li>Searches the indexes using queries supplied as string literals</li>
* <li>Verifies that the correct results are found</li>
* <li>Searches the indexes using queries stored in a file</li>
* <li>Verifies that the correct results are found</li>
* </ol>
*
* @author Alex Levenson
*/
public class PigLuceneIndexingIntegrationTest {
public static class IndexOutputFormat extends LuceneIndexStorage.PigLuceneIndexOutputFormat {
private LuceneIndexingIntegrationTest.IndexOutputFormat delegate =
new LuceneIndexingIntegrationTest.IndexOutputFormat();
private static final Text text = new Text();
@Override
protected Document buildDocument(NullWritable key, Tuple value) throws IOException {
text.set((String) value.get(0));
return delegate.buildDocument(key, text);
}
@Override
protected Analyzer newAnalyzer(Configuration conf) {
return delegate.newAnalyzer(conf);
}
}
public static class Loader extends LuceneIndexLoader<Text> implements LoadMetadata {
private static final TupleFactory TF = TupleFactory.getInstance();
public Loader(String[] args) {
super(args);
}
@Override
protected Tuple recordToTuple(int key, Text value) {
return TF.newTuple(ImmutableList.of(key, value.toString()));
}
@Override
protected LuceneIndexInputFormat<Text> getLuceneIndexInputFormat() throws IOException {
return new LuceneIndexingIntegrationTest.IndexInputFormat();
}
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
return new ResourceSchema(Utils.getSchemaFromString("queryId:int, text:chararray"));
}
@Override
public ResourceStatistics getStatistics(String s, Job job) throws IOException {
return null;
}
@Override
public String[] getPartitionKeys(String s, Job job) throws IOException {
return new String[0];
}
@Override
public void setPartitionFilter(Expression expression) throws IOException {
}
}
@Rule
public TemporaryFolder tempDir = new TemporaryFolder();
@Test
public void testIndexing() throws Exception {
for (Path input : LuceneIndexingIntegrationTest.INPUT_PATHS) {
// the input files are in the lucene module's resources but the
// cwd is the pig-lucene module's directory
input = new Path(new File("../lucene/" + input.toString()).getAbsolutePath());
PigServer pigServer = new PigServer(ExecType.LOCAL);
pigServer.setBatchOn();
Map<String, String> params = ImmutableMap.of(
"INPUT", input.toString(),
"OUTPUT", new File(tempDir.getRoot(), input.getName()).getAbsolutePath()
);
pigServer.registerScript(
"src/test/resources/com/twitter/elephantbird/pig/index.pig",
params);
runPigScript(pigServer, "Indexing of " + input + " failed!");
}
File resultsQueries = new File(tempDir.getRoot(), "results_queries");
File resultsFile = new File(tempDir.getRoot(), "results_file");
File queryFile = tempDir.newFile("queryfile.txt");
Map<String, String> paramsQueries = ImmutableMap.of(
"INPUT", tempDir.getRoot().getAbsolutePath(),
"OUTPUT", resultsQueries.getAbsolutePath()
);
Map<String, String> paramsFile = ImmutableMap.of(
"INPUT", tempDir.getRoot().getAbsolutePath(),
"OUTPUT", resultsFile.getAbsolutePath(),
"QUERY_FILE", queryFile.getAbsolutePath()
);
PigServer pigServer = new PigServer(ExecType.LOCAL);
pigServer.setBatchOn();
pigServer.registerScript(
"src/test/resources/com/twitter/elephantbird/pig/search_queries.pig",
paramsQueries);
runPigScript(pigServer, "Searching via string literal queries failed!");
assertEquals(LuceneIndexingIntegrationTest.expectedResults,
LuceneIndexingIntegrationTest.parseResultsFile(new File(resultsQueries, "part-m-00000")));
// write the queries to a file to test loading queries from a file
Writer out = new OutputStreamWriter(new FileOutputStream(queryFile));
for (String query : LuceneIndexingIntegrationTest.QUERIES) {
out.write(query);
out.write("\n");
}
out.close();
pigServer = new PigServer(ExecType.LOCAL);
pigServer.setBatchOn();
pigServer.registerScript(
"src/test/resources/com/twitter/elephantbird/pig/search_file.pig",
paramsFile);
runPigScript(pigServer, "Searching via queries loaded from a file failed!");
assertEquals(LuceneIndexingIntegrationTest.expectedResults,
LuceneIndexingIntegrationTest.parseResultsFile(new File(resultsFile, "part-m-00000")));
}
private void runPigScript(PigServer server, String failureMessage) throws IOException {
List<ExecJob> jobs = server.executeBatch();
for (ExecJob job : jobs) {
if (job.getStatus() != ExecJob.JOB_STATUS.COMPLETED) {
throw new RuntimeException(failureMessage);
}
}
}
}