package edu.umd.cloud9.integration.collection.wikipedia; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.util.List; import java.util.Random; import junit.framework.JUnit4TestAdapter; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.Test; import com.google.common.base.Joiner; import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMapping; import edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder; import edu.umd.cloud9.integration.IntegrationUtils; public class WikipediaBasicIT { private static final Random random = new Random(); private static final String tmpPrefix = "tmp-" + WikipediaBasicIT.class.getCanonicalName() + "-" + random.nextInt(10000); public void testWikiDocnoMapping(String language, String input, String docid1, String docid2, int numDisamb, int numArticles, int total) throws Exception { Configuration conf = IntegrationUtils.getBespinConfiguration(); FileSystem fs = FileSystem.get(conf); assertTrue(fs.exists(new Path(input))); String mappingFile = tmpPrefix + "-" + language + "wiki-mapping.dat"; String[] args = new String[] { "hadoop jar", IntegrationUtils.getJar("target", "cloud9"), edu.umd.cloud9.collection.wikipedia.WikipediaDocnoMappingBuilder.class.getCanonicalName(), "-" + WikipediaDocnoMappingBuilder.INPUT_OPTION + "=" + input, "-" + WikipediaDocnoMappingBuilder.OUTPUT_FILE_OPTION + "=" + mappingFile, "-" + WikipediaDocnoMappingBuilder.LANGUAGE_OPTION + "=" + language }; List<Integer> counts = IntegrationUtils.execWiki(Joiner.on(" ").join(args)); WikipediaDocnoMapping mapping = new WikipediaDocnoMapping(); mapping.loadMapping(new Path(mappingFile), fs); System.out.println("DISAMBIGUATION = " + numDisamb + "; ARTICLE = " + numArticles + "; TOTAL = " + total); System.out.println("DOCNO 0 = " + mapping.getDocid(1)); System.out.println("DOCNO 100000 = " + mapping.getDocid(100000)); // docno to docid assertEquals(docid1, mapping.getDocid(1)); assertEquals(docid2, mapping.getDocid(100000)); // docid to docno assertEquals(1, mapping.getDocno(docid1)); assertEquals(100000, mapping.getDocno(docid2)); // # of disamb pages assertEquals(numDisamb, (int) counts.get(0)); // # of articles assertEquals(numArticles, (int) counts.get(1)); // total # assertEquals(total, (int) counts.get(2)); } @Test public void testAllWikis() throws Exception { testWikiDocnoMapping("en", "/collections/wikipedia/enwiki-20121201-pages-articles", "12", "189362", 123666, 4033137, 12961996); testWikiDocnoMapping("cs", "/collections/wikipedia/cswiki-20121215-pages-articles.xml", "4", "344433", 7800, 248999, 497398); testWikiDocnoMapping("de", "/collections/wikipedia/dewiki-20121215-pages-articles.xml", "1", "297141", 174678, 1326111, 3001626); testWikiDocnoMapping("es", "/collections/wikipedia/eswiki-20121130-pages-articles.xml", "7", "358642", 36669, 1092193, 2611748); testWikiDocnoMapping("ar", "/collections/wikipedia/arwiki-20121218-pages-articles.xml", "7", "572997", 3789, 237860, 529641); testWikiDocnoMapping("zh", "/collections/wikipedia/zhwiki-20121210-pages-articles.xml", "13", "456258", 17992, 602267, 2067973); testWikiDocnoMapping("tr", "/collections/wikipedia/trwiki-20121217-pages-articles.xml", "5", "432151", 5938, 240952, 589118); } public static junit.framework.Test suite() { return new JUnit4TestAdapter(WikipediaBasicIT.class); } }