package com.formulasearchengine.mathosphere.mlp.text; import com.formulasearchengine.mathosphere.mlp.PatternMatchingRelationFinder; import com.formulasearchengine.mathosphere.mlp.cli.FlinkMlpCommandConfig; import com.formulasearchengine.mathosphere.mlp.pojos.MathTag; import com.formulasearchengine.mathosphere.mlp.pojos.Sentence; import com.formulasearchengine.mathosphere.mlp.pojos.Word; import org.apache.commons.io.IOUtils; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; import java.util.List; import static org.junit.Assert.assertEquals; public class PosTaggerTest { private static final Logger LOGGER = LoggerFactory.getLogger(PosTaggerTest.class); @Test public void annotation() throws Exception { FlinkMlpCommandConfig cfg = FlinkMlpCommandConfig.test(); PosTagger nlpProcessor = PosTagger.create(cfg); String text = readText("escaped.txt"); List<MathTag> mathTags = WikiTextUtils.findMathTags(text); String newText = WikiTextUtils.replaceAllFormulas(text, mathTags); String cleanText = WikiTextUtils.extractPlainText(newText); List<Sentence> result = nlpProcessor.process(cleanText, mathTags); List<Word> expected = Arrays.asList(w("where", "WRB"), w("Ψ", "ID"), w("is", "VBZ"), w("the", "DT"), w("wave function", "LNK"), w("of", "IN"), w("the", "DT"), w("quantum system", "NN+"), w(",", ","), w("i", "FW"), w("is", "VBZ"), w("the", "DT"), w("imaginary unit", "LNK"), w(",", ","), w("ħ", "NN"), w("is", "VBZ"), w("the", "DT"), w("reduced Planck constant", "LNK")); List<Word> sentence = result.get(0).getWords(); assertEquals(expected, sentence.subList(0, expected.size())); LOGGER.debug("full result: {}", result); } @Test public void joinLinks_withLinks() { List<Word> in = Arrays.asList(w("Since", "IN"), w("``", "``"), w("energy", "NN"), w("''", "''"), w("and", "CC"), w("``", "``"), w("momentum", "NN"), w("''", "''"), w("are", "VBP"), w("related", "VBN")); List<Word> expected = Arrays.asList(w("Since", "IN"), w("energy", PosTag.LINK), w("and", "CC"), w("momentum", PosTag.LINK), w("are", "VBP"), w("related", "VBN")); List<Word> actual = PosTagger.concatenateLinks(in, new HashSet<String>()); assertEquals(expected, actual); } @Test public void joinLinks_noLinks() { List<Word> in = Arrays.asList(w("Since", "IN"), w("energy", "NN"), w("and", "CC"), w("momentum", "NN"), w("are", "VBP"), w("related", "VBN")); List<Word> expected = in; List<Word> actual = PosTagger.concatenateLinks(in, new HashSet<String>()); assertEquals(expected, actual); } @Test public void concatenate_inside() { List<Word> in = Arrays.asList(w("Since", "IN"), w("energy", "NN"), w("momentum", "NN"), w("related", "VBN")); List<Word> expected = Arrays.asList(w("Since", "IN"), w("energy momentum", "NN+"), w("related", "VBN")); List<Word> actual = PosTagger.concatenateSuccessiveNounsToNounSequence(in); assertEquals(expected, actual); } @Test public void concatenate_noSucc() { List<Word> in = Arrays.asList(w("Since", "IN"), w("energy", "NN"), w("and", "CC"), w("momentum", "NN"), w("are", "VBP"), w("related", "VBN")); List<Word> expected = in; List<Word> actual = PosTagger.concatenateSuccessiveNounsToNounSequence(in); assertEquals(expected, actual); } @Test public void concatenateJJtoNP() { List<Word> in = Arrays.asList(w("to", "TO"), w("be", "VB"), w("the", "DT"), w("same", "JJ"), w("type", "NN")); List<Word> expected = Arrays.asList(w("to", "TO"), w("be", "VB"), w("the", "DT"), w("same type", "NP")); List<Word> actual = PosTagger.contatenateSuccessive2Tags(in, "JJ", "NN", "NP"); assertEquals(expected, actual); } @Test public void concatenateJJtoNP_notFollowed() { List<Word> in = Arrays.asList(w("be", "VB"), w("the", "DT"), w("same", "JJ"), w("to", "TO")); List<Word> expected = in; List<Word> actual = PosTagger.contatenateSuccessive2Tags(in, "JJ", "NN", "NP"); assertEquals(expected, actual); } @Test public void concatenateJJtoNP_JPLast() { List<Word> in = Arrays.asList(w("to", "TO"), w("be", "VB"), w("the", "DT"), w("same", "JJ")); List<Word> expected = in; List<Word> actual = PosTagger.contatenateSuccessive2Tags(in, "JJ", "NN", "NP"); assertEquals(expected, actual); } public static Word w(String word, String tag) { return new Word(word, tag); } public static String readText(String name) throws IOException { InputStream inputStream = PatternMatchingRelationFinder.class.getResourceAsStream(name); return IOUtils.toString(inputStream); } }