package edu.umd.cloud9.integration.webgraph;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.Map;
import java.util.Random;
import junit.framework.JUnit4TestAdapter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.junit.Test;
import tl.lin.data.array.ArrayListWritable;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import edu.umd.cloud9.integration.IntegrationUtils;
import edu.umd.cloud9.webgraph.DriverUtil;
import edu.umd.cloud9.webgraph.data.AnchorText;
import edu.umd.cloud9.webgraph.data.AnchorTextConstants;
/**
 * Integration test that runs {@code ClueWebDriver} over the collection at
 * {@link #collectionPath} and spot-checks a handful of records in the resulting
 * webgraph output (URL and outgoing-link targets for selected docnos).
 */
public class ClueWeb09EN01WebgraphIT {
  private static final Random rand = new Random();
  // Random suffix so that separate runs write to different temporary directories.
  private static final String tmp = "/tmp/tmp-" + ClueWeb09EN01WebgraphIT.class.getSimpleName() +
      rand.nextInt(10000);

  private static final String collectionPath =
      "/collections/ClueWeb09/data.block/";
  private static final String docnoMapping =
      "/collections/ClueWeb09/docno-mapping.dat";
  private static final String collectionOutput = tmp + "/webgraph-clueweb09";

  // Expected URL per docno; 200 and 600 are read from part-00000, 10 and 610 from part-00010.
  private ImmutableMap<Integer, String> urlMap = ImmutableMap.of(
      200, "http://160.254.123.37/adr_index_performance_review.jsp",
      600, "http://207.218.246.235/s/spiderman4/",
      10, "http://00perdomain.com/computers/",
      610, "http://207.218.246.235/s/startrek11/news/863_Tyler_Perry_Joins_Star_Trek_11_Cast.html");

  // Expected internal out-link targets per docno (same records as urlMap).
  private ImmutableMap<Integer, ImmutableSet<Integer>> internalLinkMap = ImmutableMap.of(
      200,
      ImmutableSet.of(207,208,209,210,201,202,203,204,205,206),
      600,
      ImmutableSet.of(520,615,616,619,526,480,481,529,533,487,629,601,
          585,492,591,641,596,646,506,507,602,603,604,605,
          559,651,467,468),
      10,
      ImmutableSet.of(11,13,6),
      610,
      ImmutableSet.of(520,615,619,480,481,626,486,487,629,600,614,492,533,591,
          640,641,548,596,646,506,507,651,605,559,467,468));

  // Expected external out-link targets; only docno 600 is checked.
  private ImmutableMap<Integer, ImmutableSet<Integer>> externalLinkMap = ImmutableMap.of(
      600,
      ImmutableSet.of(31937044));

  /**
   * Runs the driver, then verifies its output. Both steps live in one test
   * method because verification reads what the driver run wrote.
   */
  @Test
  public void runTests() throws Exception {
    runClueDriver();
    verifyWebGraph();
  }

  /**
   * Checks that the input collection exists, removes any previous output
   * directory, and launches {@code ClueWebDriver} through
   * {@code IntegrationUtils.exec} as a single space-joined command line.
   */
  private void runClueDriver() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    assertTrue(fs.exists(new Path(collectionPath)));

    // Recursive delete of a previous run's output, if any.
    fs.delete(new Path(collectionOutput), true);

    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
        edu.umd.cloud9.webgraph.driver.ClueWebDriver.class.getCanonicalName(),
        "-input", collectionPath,
        "-output", collectionOutput,
        "-docno", docnoMapping,
        "-begin", "1",
        "-end", "1",
        "-il",
        "-normalizer", edu.umd.cloud9.webgraph.normalizer.AnchorTextBasicNormalizer.class.getCanonicalName()};

    IntegrationUtils.exec(Joiner.on(" ").join(args));
  }

  /**
   * Reads records positionally from two part files of the webgraph output and
   * compares selected ones against the expected URL and link maps.
   *
   * <p>Every read asserts that a record was actually returned and that its key
   * is the expected docno, so a short or reordered part file fails loudly
   * instead of being checked against stale data. Readers are closed in
   * {@code finally} so a failed assertion does not leak them.
   */
  private void verifyWebGraph() throws Exception {
    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);

    IntWritable key = new IntWritable();
    ArrayListWritable<AnchorText> value = new ArrayListWritable<AnchorText>();

    SequenceFile.Reader reader = openPart(fs, "part-00000");
    try {
      readRecord(reader, key, value, 200);
      verifyURLs(200, urlMap, value);
      verifyLinks(200, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);

      readRecord(reader, key, value, 400); // skipped, contents not checked

      readRecord(reader, key, value, 600);
      verifyURLs(600, urlMap, value);
      verifyLinks(600, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
      verifyLinks(600, AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val, externalLinkMap, value);
    } finally {
      reader.close();
    }

    reader = openPart(fs, "part-00010");
    try {
      readRecord(reader, key, value, 10);
      verifyURLs(10, urlMap, value);
      verifyLinks(10, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);

      readRecord(reader, key, value, 210); // skipped, contents not checked
      readRecord(reader, key, value, 410); // skipped, contents not checked

      readRecord(reader, key, value, 610);
      verifyURLs(610, urlMap, value);
      verifyLinks(610, AnchorTextConstants.Type.INTERNAL_OUT_LINK.val, internalLinkMap, value);
    } finally {
      reader.close();
    }
  }

  /** Opens the named part file under the webgraph output directory. */
  private SequenceFile.Reader openPart(FileSystem fs, String partName) throws Exception {
    return new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(
        new Path(collectionOutput + "/" + DriverUtil.OUTPUT_WEBGRAPH + "/" + partName)));
  }

  /**
   * Reads the next record into {@code key}/{@code value} and asserts that one
   * was available and that it carries the expected docno.
   */
  private void readRecord(SequenceFile.Reader reader, IntWritable key,
      ArrayListWritable<AnchorText> value, int expectedKey) throws Exception {
    assertTrue("Unexpected end of file while reading docno " + expectedKey,
        reader.next(key, value));
    assertEquals("Unexpected docno at this position", expectedKey, key.get());
  }

  /**
   * Asserts that the first URL entry in {@code value} equals the URL expected
   * for {@code key}, and fails if the record has no URL entry at all.
   *
   * @param key   docno of the record being checked
   * @param urls  expected URL per docno
   * @param value entries of the record read from the webgraph
   */
  private void verifyURLs(int key, Map<Integer, String> urls,
      ArrayListWritable<AnchorText> value) {
    boolean found = false;
    for (int i = 0; i < value.size(); i++) {
      AnchorText entry = value.get(i);
      if (entry.isURL()) {
        assertEquals(urls.get(key), entry.getText());
        found = true;
        break; // only the first URL entry is compared
      }
    }
    assertTrue("No URL entry found for docno " + key, found);
  }

  /**
   * Asserts that every entry of the requested link type in {@code value} has
   * exactly as many targets as expected for {@code key} and that each target
   * is in the expected set; fails if no entry of that type exists.
   *
   * @param key   docno of the record being checked
   * @param type  {@code INTERNAL_OUT_LINK.val} or {@code EXTERNAL_OUT_LINK.val};
   *              entries of any other type are ignored
   * @param links expected target docnos per source docno
   * @param value entries of the record read from the webgraph
   */
  private void verifyLinks(int key, byte type,
      Map<Integer, ImmutableSet<Integer>> links,
      ArrayListWritable<AnchorText> value) {
    ImmutableSet<Integer> expected = links.get(key);
    assertTrue("No expected links registered for docno " + key, expected != null);

    boolean wantInternal = type == AnchorTextConstants.Type.INTERNAL_OUT_LINK.val;
    boolean wantExternal = type == AnchorTextConstants.Type.EXTERNAL_OUT_LINK.val;

    boolean found = false;
    for (int i = 0; i < value.size(); i++) {
      AnchorText entry = value.get(i);
      boolean matches = (entry.isInternalOutLink() && wantInternal)
          || (entry.isExternalOutLink() && wantExternal);
      if (!matches) {
        continue;
      }
      found = true;

      int[] targets = entry.getDocuments();
      assertEquals(expected.size(), targets.length);
      for (int j = 0; j < targets.length; j++) {
        assertTrue("Unexpected link target " + targets[j] + " for docno " + key,
            expected.contains(targets[j]));
      }
    }
    assertTrue("No link entry of type " + type + " found for docno " + key, found);
  }

  /** JUnit 3 adapter so the test can be run by JUnit 3 style runners. */
  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(ClueWeb09EN01WebgraphIT.class);
  }
}