package edu.umd.cloud9.integration.collection.trecweb; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.util.Random; import junit.framework.JUnit4TestAdapter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.Test; import com.google.common.base.Joiner; import edu.umd.cloud9.collection.DocnoMapping; import edu.umd.cloud9.collection.trecweb.Gov2DocnoMapping; import edu.umd.cloud9.collection.trecweb.TrecWebDocumentInputFormat; import edu.umd.cloud9.collection.trecweb.Wt10gDocnoMapping; import edu.umd.cloud9.integration.IntegrationUtils; public class IT { private static final Random random = new Random(); private static final Path wt10gPath = new Path("/collections/wt10g/data"); private static final Path gov2Path = new Path("/collections/gov2/data"); private static final String tmpPrefix = "tmp-" + IT.class.getCanonicalName() + "-" + random.nextInt(10000); // wt10g, repacked. @Test public void testWt10gDocnoMappingRepacked() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(wt10gPath)); String repackedCollection = tmpPrefix + "-wt10g-repacked"; String wt10gMappingFile = tmpPrefix + "-wt10g-mapping-repacked.dat"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trecweb.RepackTrecWebCollection.class.getCanonicalName(), "-collection=" + wt10gPath, "-output=" + repackedCollection, "-compressionType=block" }; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + repackedCollection, "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + wt10gMappingFile }; IntegrationUtils.exec(Joiner.on(" ").join(args)); Wt10gDocnoMapping mapping = new Wt10gDocnoMapping(); mapping.loadMapping(new Path(wt10gMappingFile), fs); assertEquals("WTX001-B01-1", mapping.getDocid(1)); assertEquals("WTX062-B34-37", mapping.getDocid(1000000)); assertEquals(1, mapping.getDocno("WTX001-B01-1")); assertEquals(1000000, mapping.getDocno("WTX062-B34-37")); } // gov2, repacked. @Test public void testGov2DocnoMappingRepacked() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(gov2Path)); String repackedCollection = tmpPrefix + "-gov2-repacked"; String gov2MappingFile = tmpPrefix + "-gov2-mapping-repacked.dat"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trecweb.RepackTrecWebCollection.class.getCanonicalName(), "-collection=" + gov2Path, "-output=" + repackedCollection, "-compressionType=block" }; IntegrationUtils.exec(Joiner.on(" ").join(args)); args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + repackedCollection, "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + gov2MappingFile }; IntegrationUtils.exec(Joiner.on(" ").join(args)); Gov2DocnoMapping mapping = new Gov2DocnoMapping(); mapping.loadMapping(new Path(gov2MappingFile), fs); assertEquals("GX000-00-0000000", mapping.getDocid(1)); assertEquals("GX210-38-0737901", mapping.getDocid(20000000)); assertEquals(1, mapping.getDocno("GX000-00-0000000")); assertEquals(20000000, mapping.getDocno("GX210-38-0737901")); } // wt10g, original. @Test public void testWt10gDocnoMapping() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(wt10gPath)); String wt10gMappingFile = tmpPrefix + "-wt10g-mapping.dat"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + wt10gPath, "-" + DocnoMapping.BuilderUtils.FORMAT_OPTION + "=" + TrecWebDocumentInputFormat.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + wt10gMappingFile }; IntegrationUtils.exec(Joiner.on(" ").join(args)); Wt10gDocnoMapping mapping = new Wt10gDocnoMapping(); mapping.loadMapping(new Path(wt10gMappingFile), fs); assertEquals("WTX001-B01-1", mapping.getDocid(1)); assertEquals("WTX062-B34-37", mapping.getDocid(1000000)); assertEquals(1, mapping.getDocno("WTX001-B01-1")); assertEquals(1000000, mapping.getDocno("WTX062-B34-37")); } // gov2, original. @Test public void testGov2DocnoMapping() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(gov2Path)); String gov2MappingFile = tmpPrefix + "-gov2-mapping.dat"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + gov2Path, "-" + DocnoMapping.BuilderUtils.FORMAT_OPTION + "=" + TrecWebDocumentInputFormat.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + gov2MappingFile }; IntegrationUtils.exec(Joiner.on(" ").join(args)); Gov2DocnoMapping mapping = new Gov2DocnoMapping(); mapping.loadMapping(new Path(gov2MappingFile), fs); assertEquals("GX000-00-0000000", mapping.getDocid(1)); assertEquals("GX210-38-0737901", mapping.getDocid(20000000)); assertEquals(1, mapping.getDocno("GX000-00-0000000")); assertEquals(20000000, mapping.getDocno("GX210-38-0737901")); } public static junit.framework.Test suite() { return new JUnit4TestAdapter(IT.class); } }