package edu.umd.cloud9.integration.collection.wikipedia;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.util.Map;
import java.util.Random;

import junit.framework.JUnit4TestAdapter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;

import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableSet;

import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder;
import edu.umd.cloud9.integration.IntegrationUtils;

/**
 * Integration test that builds the link graph from the enwiki-20121201 Wikipedia dump and runs
 * five iterations of parallel breadth-first search from a single source page.
 */
public class WikipediaBfsIT {
  private static final Random random = new Random();
  private static final String tmpPrefix = "tmp-" + WikipediaBfsIT.class.getCanonicalName()
      + "-" + random.nextInt(10000);

  @Test
  public void testBfs() throws Exception {
    String input = "/collections/wikipedia/enwiki-20121201-pages-articles";

    Configuration conf = IntegrationUtils.getBespinConfiguration();
    FileSystem fs = FileSystem.get(conf);
    assertTrue(fs.exists(new Path(input)));

    String mappingFile = tmpPrefix + "-enwiki-mapping.dat";
    Map<String, Integer> values;

    // Build the docno mapping.
    String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
        WikipediaDocnoMappingBuilder.class.getCanonicalName(),
        "-" + WikipediaDocnoMappingBuilder.INPUT_OPTION + "=" + input,
        "-" + WikipediaDocnoMappingBuilder.OUTPUT_FILE_OPTION + "=" + mappingFile,
        "-keep_all" };

    values = IntegrationUtils.execKeyValueExtractor(Joiner.on(" ").join(args),
        ImmutableSet.of("TOTAL"));
    assertEquals(12961996, (int) values.get("TOTAL"));

    // Repack the wiki with block compression.
    String repackedWiki = tmpPrefix + "-enwiki.block";
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
        edu.umd.cloud9.collection.wikipedia.RepackWikipedia.class.getCanonicalName(),
        "-input=" + input, "-mapping_file=" + mappingFile, "-output=" + repackedWiki,
        "-compression_type=block" };

    values = IntegrationUtils.execKeyValueExtractor(Joiner.on(" ").join(args),
        ImmutableSet.of("TOTAL"));
    assertEquals(12961996, (int) values.get("TOTAL"));

    // Extract the link graph.
    String wikiEdges = tmpPrefix + "-enwiki.edges";
    String wikiAdj = tmpPrefix + "-enwiki.adj";
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
        edu.umd.cloud9.collection.wikipedia.graph.ExtractWikipediaLinkGraph.class.getCanonicalName(),
        "-input=" + repackedWiki, "-edges_output=" + wikiEdges,
        "-adjacency_list_output=" + wikiAdj, "-num_partitions=10" };

    values = IntegrationUtils.execKeyValueExtractor(Joiner.on(" ").join(args),
        ImmutableSet.of("EDGES", "TOTAL_VERTICES", "VERTICES_WITH_OUTLINKS"));
    assertEquals(121762273, (int) values.get("EDGES"));
    assertEquals(12961996, (int) values.get("TOTAL_VERTICES"));
    assertEquals(10813673, (int) values.get("VERTICES_WITH_OUTLINKS"));

    // Build BFS records, with page docno 12 as the source.
    String bfsBase = tmpPrefix + "-enwiki.bfs";
    args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
        edu.umd.cloud9.example.bfs.EncodeBfsGraph.class.getCanonicalName(),
        "-input=" + wikiAdj, "-output=" + bfsBase + "/iter0000", "-src=12" };

    IntegrationUtils.exec(Joiner.on(" ").join(args));
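
    // Each BFS iteration below reads the records written by the previous iteration
    // (iter0000, iter0001, ...) and writes the next set. After every run, the test reads back
    // the ReachableInReducer counter reported by IterateBfs and asserts against it; the expected
    // values grow monotonically as more of the link graph becomes reachable from the source page.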
    // Iterations 1 through 5, with the expected reachable-vertex count after each one.
    int[] expectedReachable = { 573, 37733, 845452, 3596247, 5236564 };
    for (int i = 1; i <= expectedReachable.length; i++) {
      args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"),
          edu.umd.cloud9.example.bfs.IterateBfs.class.getCanonicalName(),
          "-input=" + bfsBase + String.format("/iter%04d", i - 1),
          "-output=" + bfsBase + String.format("/iter%04d", i),
          "-num_partitions=10" };

      values = IntegrationUtils.execKeyValueExtractor(Joiner.on(" ").join(args),
          ImmutableSet.of("ReachableInReducer"));
      assertEquals(expectedReachable[i - 1], (int) values.get("ReachableInReducer"));
    }
  }

  public static junit.framework.Test suite() {
    return new JUnit4TestAdapter(WikipediaBfsIT.class);
  }
}