package org.wikipedia.miner.extract; import static org.junit.Assert.*; import java.util.Vector; import javax.xml.stream.FactoryConfigurationError; import org.junit.Test; import org.wikipedia.miner.extract.model.DumpLink; import org.wikipedia.miner.extract.model.DumpLinkParser; import org.wikipedia.miner.extract.model.DumpPage; import org.wikipedia.miner.extract.util.SiteInfo; public class LinkMarkupHandling extends MarkupTestCase{ @Test public void testLinkParsing() throws FactoryConfigurationError, Exception { DumpLinkParser linkParser = new DumpLinkParser(getLangConf(), getSiteInfo()) ; DumpPage page = loadPage("april.xml"); String markup = page.getMarkup() ; Vector<int[]> linkRegions = getStripper().gatherComplexRegions(markup, "\\[\\[", "\\]\\]") ; for(int[] linkRegion: linkRegions) { String linkMarkup = markup.substring(linkRegion[0]+2, linkRegion[1]-2) ; linkParser.parseLink(linkMarkup, page.getTitle()) ; //System.out.println(linkMarkup) ; //System.out.println("ns:" + link.getTargetNamespace() + " anchor:" + link.getAnchor()) ; } } @Test public void testLinkVariants() throws Exception { DumpLinkParser linkParser = new DumpLinkParser(getLangConf(), getSiteInfo()) ; DumpLink link = linkParser.parseLink("Cambodia", "Thailand") ; assertEquals(link.getTargetNamespace().getKey(), SiteInfo.MAIN_KEY) ; assertEquals(link.getTargetTitle(), "Cambodia") ; assertEquals(link.getAnchor(), "Cambodia") ; link = linkParser.parseLink("Cambodia#Population|Cambodia's Population", "Thailand") ; assertEquals(link.getTargetNamespace().getKey(), SiteInfo.MAIN_KEY) ; assertEquals(link.getTargetTitle(), "Cambodia") ; assertEquals(link.getTargetSection(), "Population") ; assertEquals(link.getAnchor(), "Cambodia's Population") ; link = linkParser.parseLink("#Population|Cambodia's Population", "Cambodia") ; assertEquals(link.getTargetNamespace().getKey(), SiteInfo.MAIN_KEY) ; assertEquals(link.getTargetTitle(), "Cambodia") ; assertEquals(link.getTargetSection(), "Population") ; assertEquals(link.getAnchor(), "Cambodia's Population") ; link = linkParser.parseLink("Category:Cambodia| ", "Thailand") ; assertEquals(link.getTargetNamespace().getKey(), SiteInfo.CATEGORY_KEY) ; assertEquals(link.getTargetTitle(), "Cambodia") ; assertEquals(link.getAnchor(), "Cambodia") ; link = linkParser.parseLink("Image:8denarii.jpg|thumb|400px|row 1 : 157 BC [[Roman Republic]], 73 AD [[Vespasian]], 161 AD [[Marcus Aurelius]], 194 AD [[Septimius Severus]];\n" + "row 2: 199 AD [[Caracalla]], 200 AD [[Julia Domna]], 219 AD [[Elagabalus]], 236 AD [[Maximinus Thrax]].", "Denarii") ; assertEquals(link.getTargetNamespace().getKey(), SiteInfo.FILE_KEY) ; assertEquals(link.getTargetTitle(), "8denarii.jpg") ; link = linkParser.parseLink("Hellenic Parliament-MPs swearing in.png|thumb|left|The Greek [[parliament]] is in [[Athens]].", "Greece") ; assertEquals(link.getTargetNamespace().getKey(), SiteInfo.FILE_KEY) ; assertEquals(link.getTargetTitle(), "Hellenic Parliament-MPs swearing in.png") ; } }