package edu.umd.cloud9.integration.collection.trec; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.util.Random; import junit.framework.JUnit4TestAdapter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.util.LineReader; import org.junit.Test; import com.google.common.base.Joiner; import edu.umd.cloud9.collection.DocnoMapping; import edu.umd.cloud9.collection.trec.TrecDocnoMapping; import edu.umd.cloud9.collection.trec.TrecForwardIndex; import edu.umd.cloud9.integration.IntegrationUtils; public class IT { private static final Random random = new Random(); private static final Path collectionPath = new Path("/collections/trec/trec4-5_noCRFR.xml"); private static final String tmpPrefix = "tmp-" + IT.class.getCanonicalName() + "-" + random.nextInt(10000); private static final String mappingFile = tmpPrefix + "-mapping.dat"; @Test public void runTests() throws Exception { testDocnoMapping(); testDemoCountDocs(); testForwardIndex(); } private void testDocnoMapping() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(collectionPath)); String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trec.TrecDocnoMappingBuilder.class.getCanonicalName(), "-" + DocnoMapping.BuilderUtils.COLLECTION_OPTION + "=" + collectionPath, "-" + DocnoMapping.BuilderUtils.MAPPING_OPTION + "=" + mappingFile }; IntegrationUtils.exec(Joiner.on(" ").join(args)); TrecDocnoMapping mapping = new TrecDocnoMapping(); mapping.loadMapping(new Path(mappingFile), fs); assertEquals("FBIS3-1", mapping.getDocid(1)); assertEquals("LA061490-0139", mapping.getDocid(400000)); assertEquals(1, mapping.getDocno("FBIS3-1")); assertEquals(400000, mapping.getDocno("LA061490-0139")); } private void testDemoCountDocs() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(collectionPath)); String output = tmpPrefix + "-cnt"; String records = tmpPrefix + "-records.txt"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trec.CountTrecDocuments.class.getCanonicalName(), "-collection=" + collectionPath, "-output=" + output, "-docnoMapping=" + mappingFile, "-countOutput=" + records}; IntegrationUtils.exec(Joiner.on(" ").join(args)); LineReader reader = new LineReader(fs.open(new Path(records))); Text str = new Text(); reader.readLine(str); reader.close(); assertEquals(472525, Integer.parseInt(str.toString())); } private void testForwardIndex() throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(collectionPath)); String index = tmpPrefix + "-findex.dat"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.trec.TrecForwardIndexBuilder.class.getCanonicalName(), "-collection=" + collectionPath, "-index=" + index, "-docnoMapping=" + mappingFile }; IntegrationUtils.exec(Joiner.on(" ").join(args)); TrecForwardIndex findex = new TrecForwardIndex(); findex.loadIndex(new Path(index), new Path(mappingFile), fs); assertTrue(findex.getDocument(1).getContent().contains("Newspapers in the Former Yugoslav Republic")); assertTrue(findex.getDocument("FBIS3-1").getContent().contains("Newspapers in the Former Yugoslav Republic")); assertEquals(1, findex.getFirstDocno()); assertEquals(472525, findex.getLastDocno()); } public static junit.framework.Test suite() { return new JUnit4TestAdapter(IT.class); } }