package org.wikipedia.miner.extract;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.IOException;
import java.util.List;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLStreamException;
import opennlp.tools.util.InvalidFormatException;
import org.junit.Before;
import org.junit.Test;
import org.wikipedia.miner.extract.model.DumpPage;
import org.wikipedia.miner.extract.util.PageSentenceExtractor;
/**
 * Tests page parsing and sentence splitting against a sample dump page.
 *
 * <p>Pages are obtained through {@code loadPage} and the markup stripper through
 * {@code getStripper}; neither is defined in this class, so both are presumably
 * inherited from {@link MarkupTestCase} (TODO confirm).
 */
public class TestMarkupHandling extends MarkupTestCase {

    /** Name of the sample page handed to {@code loadPage} by both tests. */
    private static final String SAMPLE_PAGE = "autonomousCommunitiesOfSpain.xml";

    /** Location of the sentence model file, relative to the working directory of the test run. */
    private static final String SENTENCE_MODEL_PATH = "../models/en-sent.bin";

    /** Sentence extractor under test; rebuilt before every test by {@link #init()}. */
    private PageSentenceExtractor sentenceExtractor;

    /**
     * Checks the id, title and namespace key of the parsed sample page.
     */
    @Test
    public void testParsing() throws XMLStreamException, FactoryConfigurationError, IOException {
        DumpPage page = loadPage(SAMPLE_PAGE);

        // JUnit's contract is assertEquals(expected, actual); keeping that order makes
        // failure messages report the right value as "expected".
        assertEquals(12, page.getId());
        assertEquals("Autonomous communities of Spain", page.getTitle());
        assertEquals(0, page.getNamespace().getKey());
    }

    /**
     * Checks that stripping keeps the markup length unchanged and that the sample
     * page yields the expected number of sentence splits.
     */
    @Test
    public void testSentenceExtraction() throws XMLStreamException, IOException {
        DumpPage page = loadPage(SAMPLE_PAGE);

        String markup = page.getMarkup();
        String strippedMarkup = getStripper().stripAllButInternalLinksAndEmphasis(markup, ' ');

        // The stripped text must be exactly as long as the original markup.
        // NOTE(review): presumably removed markup is padded with the given ' ' so that
        // character offsets still line up with the original - confirm against the stripper.
        assertEquals(markup.length(), strippedMarkup.length());

        List<Integer> sentenceSplits = sentenceExtractor.getSentenceSplits(page);

        // Expected number of splits for the sample page.
        assertEquals(34, sentenceSplits.size());

        // Debugging aid: uncomment to print the text between consecutive splits.
        //
        // int lastSplit = 0;
        // for (int split : sentenceSplits) {
        //     System.out.println("s: " + markup.substring(lastSplit, split));
        //     lastSplit = split;
        // }
    }

    /**
     * Runs the superclass initialisation, then builds a fresh sentence extractor.
     */
    @Before
    public void init() throws FactoryConfigurationError, Exception {
        super.init();
        sentenceExtractor = loadSentenceExtractor();
    }

    /**
     * Builds a {@link PageSentenceExtractor} from the model file at
     * {@link #SENTENCE_MODEL_PATH}.
     *
     * @return a new extractor constructed from the model file
     * @throws InvalidFormatException declared for the extractor constructor
     * @throws IOException declared for the extractor constructor
     */
    private PageSentenceExtractor loadSentenceExtractor() throws InvalidFormatException, IOException {
        File sentenceModelFile = new File(SENTENCE_MODEL_PATH);
        return new PageSentenceExtractor(sentenceModelFile);
    }
}