package org.wikibrain.sr.wikify;
import org.junit.Test;
import org.wikibrain.core.lang.Language;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import static org.junit.Assert.*;
/**
 * Tests that iterate over the test corpus fixture both line-by-line and
 * document-by-document, and check that the two views agree on the number of
 * documents, the number of lines, and the content of selected lines.
 *
 * @author Shilad Sen
 */
public class TestWBCorpusLineIterable {
    /** Corpus fixture shared by both tests. */
    static final File PATH = new File("src/test/resources/test_corpus.txt.bz2");

    /** Number of distinct documents both tests expect in the fixture. */
    private static final int EXPECTED_NUM_DOCS = 50;

    /** Total number of lines both tests expect in the fixture. */
    private static final int EXPECTED_NUM_LINES = 850;

    /** Expected text of the first line. */
    private static final String FIRST_LINE = "Category Asian countries";

    /** Expected text of the third line. */
    private static final String THIRD_LINE = "It is near Land's_End:/w/simple/-1/Unknown_page";

    /** Expected prefix of the final (850th) line. */
    private static final String LAST_LINE_PREFIX = "The same sort of laws:/w/simple/426/Law can";

    /**
     * Iterates over the corpus one line at a time, checking the language of
     * every line, the text of selected lines, and the overall document and
     * line counts.
     */
    @Test
    public void testBasic() {
        WbCorpusLineReader reader = new WbCorpusLineReader(PATH);
        int numLines = 0;
        // Distinct documents, in order of first appearance.
        List<WbCorpusLineReader.DocInfo> docs = new ArrayList<WbCorpusLineReader.DocInfo>();
        for (WbCorpusLineReader.Line line : reader) {
            assertEquals(Language.SIMPLE, line.getCorpus().getLanguage());
            if (!docs.contains(line.getDoc())) {
                docs.add(line.getDoc());
            }
            numLines += 1;
            // numLines is 1-based at this point: it already counts the current line.
            if (numLines == 1) {
                assertEquals(FIRST_LINE, line.getLine());
            }
            if (numLines == 3) {
                assertEquals(THIRD_LINE, line.getLine());
            }
            if (numLines == EXPECTED_NUM_LINES) {
                assertTrue(line.getLine().startsWith(LAST_LINE_PREFIX));
            }
        }
        assertEquals(EXPECTED_NUM_DOCS, docs.size());
        assertEquals(EXPECTED_NUM_LINES, numLines);
    }

    /**
     * Iterates over the corpus one document at a time, checking the language
     * of every document, the ids of selected documents, and that the
     * concatenated lines match what the line-level test expects.
     */
    @Test
    public void testDoc() {
        WBCorpusDocReader reader = new WBCorpusDocReader(PATH);
        List<WBCorpusDocReader.Doc> docs = new ArrayList<WBCorpusDocReader.Doc>();
        List<String> lines = new ArrayList<String>();
        for (WBCorpusDocReader.Doc doc : reader) {
            assertEquals(Language.SIMPLE, doc.getCorpus().getLanguage());
            docs.add(doc);
            lines.addAll(doc.getLines());
        }

        // Check sizes first so that a truncated corpus is reported as an
        // assertion failure rather than an IndexOutOfBoundsException below.
        assertEquals(EXPECTED_NUM_DOCS, docs.size());
        assertEquals(EXPECTED_NUM_LINES, lines.size());

        assertEquals(9983, docs.get(0).getDoc().getId());
        assertEquals(9464, docs.get(1).getDoc().getId());
        assertEquals(9596, docs.get(EXPECTED_NUM_DOCS - 1).getDoc().getId());

        assertEquals(FIRST_LINE, lines.get(0));
        assertEquals(THIRD_LINE, lines.get(2));
        assertTrue(lines.get(EXPECTED_NUM_LINES - 1).startsWith(LAST_LINE_PREFIX));
    }
}