package com.formulasearchengine.mathosphere.mlp.contracts;
import com.google.common.base.Throwables;
import com.formulasearchengine.mathosphere.mlp.cli.FlinkMlpCommandConfig;
import com.formulasearchengine.mathosphere.mlp.flink.ListCollector;
import com.formulasearchengine.mathosphere.mlp.pojos.*;
import com.formulasearchengine.mathosphere.mlp.text.PosTag;
import com.formulasearchengine.mathosphere.mlp.text.WikiTextUtils;
import com.formulasearchengine.mathosphere.mlp.text.WikiTextUtilsTest;
import org.junit.Test;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
import java.util.Set;
import static org.junit.Assert.*;
public class TextAnnotatorMapperTest {
private static final Random RND = new Random();
public static final TextAnnotatorMapper TEST_INSTANCE = createTestInstance();
@Test
public void readRecentPlainWikiDump() throws Exception {
List<RawWikiDocument> docs = readWikiTextDocuments("com/formulasearchengine/mathosphere/mlp/mrrFullHist.xml");
assertEquals(1, docs.size());
}
@Test
public void test() throws Exception {
final String mathMLExtract = WikiTextUtilsTest.getTestResource("com/formulasearchengine/mathosphere/mlp/schrödinger_eq.xml").trim();
List<RawWikiDocument> docs = readWikiTextDocuments("com/formulasearchengine/mathosphere/mlp/augmentendwikitext.xml");
RawWikiDocument schroedingerIn = docs.get(0);
assertTrue("the seed math tag was not found", schroedingerIn.text.contains(mathMLExtract));
MathTag tag = new MathTag(0, mathMLExtract, WikiTextUtils.MathMarkUpType.MATHML);
String placeholder = tag.placeholder();
ParsedWikiDocument shroedingerOut = TEST_INSTANCE.map(schroedingerIn);
Set<String> identifiers = shroedingerOut.getIdentifiers().elementSet();
assertTrue(identifiers.containsAll(Arrays.asList("Ψ", "V", "h", "λ", "ρ", "τ")));
List<MathTag> formulas = shroedingerOut.getFormulas();
MathTag formula = null;
for (MathTag f : formulas) {
if (placeholder.equals(f.getKey())) {
formula = f;
break;
}
}
//@TODO: reactivate tests
assertNotNull("the placeholder was not found", formula);
assertTrue("the placeholder was not part of the sentence", contains(formula, shroedingerOut.getSentences()));
}
private static boolean contains(MathTag formula, List<Sentence> sentences) {
Word mathWord = new Word(formula.getKey(), PosTag.MATH);
for (Sentence sentence : sentences) {
List<Word> words = sentence.getWords();
if (words.contains(mathWord)) {
return true;
}
}
return false;
}
public static MathTag randomElement(List<MathTag> formulas) {
int idx = RND.nextInt(formulas.size());
return formulas.get(idx);
}
public static List<RawWikiDocument> readWikiTextDocuments(String testFile) throws Exception {
String rawImput = WikiTextUtilsTest.getTestResource(testFile);
String[] pages = rawImput.split("</page>");
TextExtractorMapper textExtractor = new TextExtractorMapper();
ListCollector<RawWikiDocument> out = new ListCollector<>();
for (String page : pages) {
textExtractor.flatMap(page, out);
}
return out.getList();
}
private static TextAnnotatorMapper createTestInstance() {
try {
TextAnnotatorMapper textAnnotator = new TextAnnotatorMapper(FlinkMlpCommandConfig.test());
textAnnotator.open(null);
return textAnnotator;
} catch (Exception e) {
throw Throwables.propagate(e);
}
}
@Test
public void tokenization_formulaSuffexed() throws Exception {
String text = "The <math>x</math>-axis shows...";
RawWikiDocument doc = new RawWikiDocument("some doc", 1, text);
ParsedWikiDocument result = TEST_INSTANCE.map(doc);
List<MathTag> formulas = result.getFormulas();
assertEquals(1, formulas.size());
Sentence sentence = result.getSentences().get(0);
List<Word> expected = Arrays.asList(new Word("The", "DT"), new Word("x", "ID"), new Word("-axis",
"-SUF"), new Word("shows", "VBZ"), new Word("...", ":"));
assertEquals(expected, sentence.getWords());
}
}