package edu.umd.cloud9.integration.example.pagerank; import static org.junit.Assert.assertTrue; import java.util.Random; import junit.framework.JUnit4TestAdapter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.Test; import tl.lin.data.pair.PairOfStrings; import com.google.common.base.Joiner; import edu.umd.cloud9.integration.IntegrationUtils; public class ClueWebPageRankIT { private static final Random random = new Random(); private static final Path collectionPath = new Path("/collections/ClueWeb09/clueweb09en01-webgraph-adjacency.txt"); private static final String tmpPrefix = "tmp-" + SimplePageRankIT.class.getCanonicalName() + "-" + random.nextInt(10000); @Test public void testPageRank() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(collectionPath)); String[] args; PairOfStrings pair; args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.BuildPageRankRecords.class.getCanonicalName(), "-input", collectionPath.toString(), "-output", tmpPrefix + "-clueweb09en01-PageRankRecords", "-numNodes", "50220423"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); // Hash partitioning, basic IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.hash.basic"); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.basic/iter0000", "-numPartitions", "200", "-numNodes", "50220423"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.hash.basic", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.hash.basic/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.basic-top10", "-top", "10"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.hash.basic-top10/part-r-00000"); assertTrue(pair.getLeftElement().contains("16073008\t-6.381")); assertTrue(pair.getLeftElement().contains("42722712\t-6.425")); assertTrue(pair.getLeftElement().contains("16073696\t-6.552")); assertTrue(pair.getLeftElement().contains("16073003\t-6.604")); assertTrue(pair.getLeftElement().contains("47345600\t-6.610")); // Hash partitioning, Schimmy IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy"); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy/iter0000", "-numPartitions", "200", "-numNodes", "50220423"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankSchimmy.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy-top10", "-top", "10"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.hash.schimmy-top10/part-r-00000"); assertTrue(pair.getLeftElement().contains("16073008\t-6.371")); assertTrue(pair.getLeftElement().contains("42722712\t-6.421")); assertTrue(pair.getLeftElement().contains("16073696\t-6.540")); assertTrue(pair.getLeftElement().contains("16073003\t-6.592")); assertTrue(pair.getLeftElement().contains("47345600\t-6.597")); // Range partitioning, basic IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.range.basic"); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.basic/iter0000", "-numPartitions", "200", "-numNodes", "50220423", "-range"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankBasic.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.range.basic", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner", "-range"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.range.basic/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.basic-top10", "-top", "10"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.range.basic-top10/part-r-00000"); assertTrue(pair.getLeftElement().contains("16073008\t-6.381")); assertTrue(pair.getLeftElement().contains("42722712\t-6.425")); assertTrue(pair.getLeftElement().contains("16073696\t-6.552")); assertTrue(pair.getLeftElement().contains("16073003\t-6.604")); assertTrue(pair.getLeftElement().contains("47345600\t-6.610")); // Range partitioning, Schimmy IntegrationUtils.exec("hadoop fs -mkdir " + tmpPrefix + "-clueweb09en01-PageRank.range.schimmy"); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.PartitionGraph.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRankRecords", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy/iter0000", "-numPartitions", "200", "-numNodes", "50220423", "-range"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.RunPageRankSchimmy.class.getCanonicalName(), "-base", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy", "-numNodes", "50220423", "-start", "0", "-end", "10", "-useInMapperCombiner", "-range"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.example.pagerank.FindMaxPageRankNodes.class.getCanonicalName(), "-input", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy/iter0010", "-output", tmpPrefix + "-clueweb09en01-PageRank.range.schimmy-top10", "-top", "10"}; IntegrationUtils.exec(Joiner.on(" ").join(args)); pair = IntegrationUtils.exec("hadoop fs -cat " + tmpPrefix + "-clueweb09en01-PageRank.range.schimmy-top10/part-r-00000"); assertTrue(pair.getLeftElement().contains("16073008\t-6.372")); assertTrue(pair.getLeftElement().contains("42722712\t-6.420")); assertTrue(pair.getLeftElement().contains("16073696\t-6.541")); assertTrue(pair.getLeftElement().contains("16073003\t-6.593")); assertTrue(pair.getLeftElement().contains("47345600\t-6.599")); } public static junit.framework.Test suite() { return new JUnit4TestAdapter(ClueWebPageRankIT.class); } }