package uk.bl.wa.hadoop.indexer.mdx;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MiniMRCluster;
import org.apache.hadoop.mapred.OutputLogFilter;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import uk.bl.wa.hadoop.mapreduce.mdx.MDX;
import uk.bl.wa.hadoop.mapreduce.mdx.MDXSeqMerger;
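
/**
 * Integration test for the MDX generation tools: runs the
 * {@link WARCMDXGenerator} MapReduce job (and then the {@link MDXSeqMerger})
 * on a single-node MiniDFSCluster/MiniMRCluster, using two gov.uk 'revisit'
 * WARCs as input.
 */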
public class WARCMDXGeneratorIntegrationTest {
private static final Log log = LogFactory
.getLog(WARCMDXGeneratorIntegrationTest.class);
// Test cluster:
private MiniDFSCluster dfsCluster = null;
private MiniMRCluster mrCluster = null;
// Input files:
public final static String[] testWarcs = new String[] {
"gov.uk-revisit-warcs/BL-20140325121225068-00000-32090~opera~8443.warc.gz",
"gov.uk-revisit-warcs/BL-20140325122341434-00000-32090~opera~8443.warc.gz" };
private final Path input = new Path("inputs");
private final Path output = new Path("outputs");
private final Path outputMerged = new Path("outputs-merged");
    // Exported results:
public static File outputSeq = new File("target/test.seq");
public static File outputMergedSeq = new File("target/test-merged.seq");
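
    /**
     * Spins up a single-node MiniDFSCluster and MiniMRCluster, then copies
     * the test WARCs from the warc-indexer test resources into HDFS.
     */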
@Before
public void setUp() throws Exception {
        // Print out the full config for debugging purposes:
        // Config index_conf = ConfigFactory.load();
        // log.debug(index_conf.root().render());
log.warn("Spinning up test cluster...");
        // Make sure the log folder exists, otherwise the test will fail:
new File("target/test-logs").mkdirs();
//
System.setProperty("hadoop.log.dir", "target/test-logs");
System.setProperty("javax.xml.parsers.SAXParserFactory",
"com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");
//
Configuration conf = new Configuration();
System.setProperty("test.build.data",
new File("target/mini-dfs").getAbsolutePath());
dfsCluster = new MiniDFSCluster(conf, 1, true, null);
dfsCluster.getFileSystem().makeQualified(input);
dfsCluster.getFileSystem().makeQualified(output);
//
mrCluster = new MiniMRCluster(1, getFileSystem().getUri().toString(), 1);
// prepare for tests
for (String filename : testWarcs) {
copyFileToTestCluster(getFileSystem(), input,
"../warc-indexer/src/test/resources/", filename);
}
log.warn("Spun up test cluster.");
}
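
    /**
     * @return the FileSystem of the test DFS cluster
     * @throws IOException if the filesystem cannot be reached
     */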
protected FileSystem getFileSystem() throws IOException {
return dfsCluster.getFileSystem();
}
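
    /**
     * Copies a local file (prefix + filename) into the test cluster's HDFS,
     * under the given input path.
     */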
public static void copyFileToTestCluster(FileSystem fs, Path input,
String prefix, String filename)
throws IOException {
Path targetPath = new Path(input, filename);
File sourceFile = new File(prefix + filename);
log.info("Copying " + filename + " into cluster at "
+ targetPath.toUri() + "...");
FSDataOutputStream os = fs.create(targetPath);
InputStream is = new FileInputStream(sourceFile);
IOUtils.copy(is, os);
is.close();
os.close();
log.info("Copy completed.");
}
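
    /**
     * Writes the given paths, one per line, to a temporary local file that
     * can be passed to a job as its list of inputs.
     */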
public static File writeInputFile(Path[] inputFiles) throws Exception {
// Make a list:
File tmpInputsFile = File.createTempFile("inputs", ".txt");
tmpInputsFile.deleteOnExit();
Writer s = new FileWriter(tmpInputsFile);
for (Path p : inputFiles) {
s.write(p.toString() + "\n");
}
s.close();
return tmpInputsFile;
}
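
    /**
     * Runs the WARCMDXGenerator over the two test WARCs, copies the
     * resulting sequence file out of HDFS, verifies the expected number of
     * MDX records, and then exercises the MDXSeqMerger on the same output.
     */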
@SuppressWarnings("deprecation")
@Test
public void testMDXGenerator() throws Exception {
        // Check that the two test WARCs are present in HDFS:
        log.info("Checking input files are present...");
Path[] inputFiles = FileUtil.stat2Paths(getFileSystem()
.listStatus(new Path(input, "gov.uk-revisit-warcs/"),
new OutputLogFilter()));
Assert.assertEquals(2, inputFiles.length);
// Create a file of the inputs
File tmpInputsFile = writeInputFile(inputFiles);
// Set up arguments for the job:
String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o",
this.output.getName() };
        // Set up the WARCMDXGenerator:
WARCMDXGenerator wir = new WARCMDXGenerator();
        // Job configuration:
        log.info("Setting up job config...");
JobConf jobConf = this.mrCluster.createJobConf();
jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
wir.createJobConf(jobConf, args);
log.info("Running job...");
JobClient.runJob(jobConf);
log.info("Job finished, checking the results...");
// check the output exists
Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(
output, new OutputLogFilter()));
        // Default is one reducer (as merging multiple sequence files is not
        // a mere matter of concatenation):
Assert.assertEquals(1, outputFiles.length);
// Copy the output out of HDFS and onto local FS:
FileOutputStream fout = new FileOutputStream(outputSeq);
        for (Path outputFile : outputFiles) {
            log.info(" --- output : " + outputFile);
            if (getFileSystem().isFile(outputFile)) {
                InputStream is = getFileSystem().open(outputFile);
                IOUtils.copy(is, fout);
                is.close();
            } else {
                log.info(" --- ...skipping directory...");
            }
            fout.flush();
        }
fout.close();
// Check contents of the output:
Configuration config = new Configuration();
Path path = new Path(outputSeq.getAbsolutePath());
SequenceFile.Reader reader = new SequenceFile.Reader(
FileSystem.get(config), path, config);
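        // Instantiate the key/value types recorded in the sequence file: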
WritableComparable key = (WritableComparable) reader.getKeyClass()
.newInstance();
Writable value = (Writable) reader.getValueClass().newInstance();
MDX mdx;
int counter = 0;
while (reader.next(key, value)) {
mdx = new MDX(value.toString());
System.out.println("Key is: " + key + " record_type: "
+ mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
counter++;
}
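        // The two test WARCs are expected to yield 114 MDX records in all: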
assertEquals(114, counter);
reader.close();
// Now test the MDXSeqMerger
testSeqMerger(outputFiles);
}
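
    /**
     * Runs the MDXSeqMerger over the sequence file(s) produced by the
     * generator job, and copies the merged output to the local FS.
     */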
private void testSeqMerger(Path[] inputFiles) throws Exception {
// Create a file of the inputs
File tmpInputsFile = writeInputFile(inputFiles);
// Set up arguments for the job:
String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o",
this.outputMerged.getName(), "-r", "1" };
        // Set up the MDXSeqMerger:
MDXSeqMerger msm = new MDXSeqMerger();
        // Job configuration:
        log.info("Setting up job config...");
JobConf jobConf = this.mrCluster.createJobConf();
msm.createJobConf(jobConf, args);
log.info("Running job...");
JobClient.runJob(jobConf);
log.info("Job finished, checking the results...");
// Copy the output out of HDFS and onto local FS:
FileOutputStream fout = new FileOutputStream(outputMergedSeq);
Path[] outputFiles = FileUtil.stat2Paths(getFileSystem()
.listStatus(outputMerged, new OutputLogFilter()));
        for (Path outputFile : outputFiles) {
            log.info(" --- output : " + outputFile);
            if (getFileSystem().isFile(outputFile)) {
                InputStream is = getFileSystem().open(outputFile);
                IOUtils.copy(is, fout);
                is.close();
            } else {
                log.info(" --- ...skipping directory...");
            }
            fout.flush();
        }
fout.close();
}
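
    /**
     * Shuts down the MiniDFSCluster and MiniMRCluster after each test.
     */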
@After
public void tearDown() throws Exception {
log.warn("Tearing down test cluster...");
if (dfsCluster != null) {
dfsCluster.shutdown();
dfsCluster = null;
}
if (mrCluster != null) {
mrCluster.shutdown();
mrCluster = null;
}
log.warn("Torn down test cluster.");
}
}