package uk.bl.wa.hadoop.indexer.mdx;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.MapDriver;
import org.apache.hadoop.mrunit.MapReduceDriver;
import org.apache.hadoop.mrunit.ReduceDriver;
import org.apache.hadoop.mrunit.types.Pair;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.json.JSONException;
import org.junit.Before;
import org.junit.Test;
import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;
import com.typesafe.config.ConfigRenderOptions;
import uk.bl.wa.hadoop.WritableArchiveRecord;
import uk.bl.wa.hadoop.mapreduce.mdx.MDX;
import uk.bl.wa.hadoop.mapreduce.mdx.MDXReduplicatingReducer;
public class WARCMDXMapperTest {

    private static final Log LOG = LogFactory.getLog(WARCMDXMapperTest.class);

    // MRUnit drivers under test; mapReduceDriver is created but not wired up
    // with a mapper/reducer here — presumably used by other tests. TODO confirm.
    MapDriver<Text, WritableArchiveRecord, Text, Text> mapDriver;
    ReduceDriver<Text, Text, Text, Text> reduceDriver;
    MapReduceDriver<Text, WritableArchiveRecord, Text, Text, Text, Text> mapReduceDriver;

    /**
     * Sets up the MRUnit drivers, injecting the "warc" section of the test
     * "mdx" Typesafe config into the Hadoop {@link Configuration} (rendered as
     * concise JSON under {@code WARCMDXGenerator.CONFIG_PROPERTIES}) so the
     * mapper picks it up.
     */
    @Before
    public void setUp() {
        // Overload the config:
        Configuration conf = new Configuration();
        Config c = ConfigFactory.load("mdx");
        conf.set(WARCMDXGenerator.CONFIG_PROPERTIES, c.withOnlyPath("warc")
                .root().render(ConfigRenderOptions.concise()));
        // Set up the mapper etc.:
        WARCMDXMapper mapper = new WARCMDXMapper();
        MDXReduplicatingReducer reducer = new MDXReduplicatingReducer();
        mapDriver = MapDriver.newMapDriver(mapper).withConfiguration(conf);
        reduceDriver = ReduceDriver.newReduceDriver(reducer);
        mapReduceDriver = MapReduceDriver.newMapReduceDriver();
    }

    /**
     * Runs every analysable record of a known test WARC through the mapper.
     * For the {@code http://data.gov.uk/} response record, asserts the emitted
     * key and the MDX url/hash/timestamp fields against expected values.
     *
     * @throws IOException if the WARC file cannot be read or the driver fails
     * @throws JSONException if an emitted MDX value is not valid JSON
     */
    @Test
    public void testMapper() throws IOException, JSONException {
        // Record content types the mapper cannot analyse:
        Set<String> skippableRecords = new HashSet<String>();
        skippableRecords.add("application/warc-fields");
        skippableRecords.add("text/dns");

        File inputFile = new File(
                "../warc-indexer/src/test/resources/gov.uk-revisit-warcs/BL-20140325121225068-00000-32090~opera~8443.warc.gz");
        String archiveName = inputFile.getName();

        // FIX: the reader was previously never closed (file-handle leak in the
        // test JVM); ensure it is closed even if an assertion fails.
        ArchiveReader reader = ArchiveReaderFactory.get(inputFile);
        try {
            Iterator<ArchiveRecord> ir = reader.iterator();
            Text key = new Text();
            WritableArchiveRecord value = new WritableArchiveRecord();
            while (ir.hasNext()) {
                // No cast needed: the iterator is already typed.
                ArchiveRecord record = ir.next();
                key.set(archiveName);
                value.setRecord(record);
                LOG.info("GOT: " + record.getHeader().getRecordIdentifier());
                LOG.info("GOT: " + record.getHeader().getMimetype());
                // Skip records that can't be analysed:
                if (skippableRecords.contains(record.getHeader()
                        .getMimetype())) {
                    continue;
                }
                // Run through them all:
                LOG.info("Running without testing output...");
                mapDriver.setInput(key, value);
                List<Pair<Text, Text>> result = mapDriver.run();
                if (result != null && !result.isEmpty()) {
                    MDX mdx = new MDX(result.get(0).getSecond().toString());
                    LOG.info("RESULT MDX: " + mdx);
                    // Perform a specific check for one of the items:
                    if ("http://data.gov.uk/".equals(record.getHeader().getUrl())
                            && record.getHeader().getMimetype()
                                    .contains("response")) {
                        Text testKey = new Text(
                                "sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG");
                        MDX testMdx = new MDX(
                                "{\"digest\":\"sha1:SKAVWVVB6HYPSTY3YNQJVM2C4FZRWBSG\",\"url\":\"http://data.gov.uk/\",\"timestamp\":\"20140325121238\"}");
                        assertEquals(testKey, result.get(0).getFirst());
                        assertEquals(testMdx.getUrl(), mdx.getUrl());
                        assertEquals(testMdx.getHash(), mdx.getHash());
                        assertEquals(testMdx.getTs(), mdx.getTs());
                    }
                }
                // Reset so output from this record doesn't leak into the next run:
                mapDriver.resetOutput();
            }
        } finally {
            reader.close();
        }
    }
}