package com.formulasearchengine.mathosphere.mlp.text; import com.formulasearchengine.mathosphere.mlp.PatternMatchingRelationFinder; import com.formulasearchengine.mathosphere.mlp.cli.FlinkMlpCommandConfig; import com.formulasearchengine.mathosphere.mlp.pojos.MathTag; import com.formulasearchengine.mathosphere.mlp.pojos.Sentence; import com.formulasearchengine.mathosphere.mlp.pojos.Word; import org.apache.commons.io.IOUtils; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.List; import static org.junit.Assert.assertEquals; public class PosTaggerTestGer { private static final Logger LOGGER = LoggerFactory.getLogger(PosTaggerTestGer.class); /** * other models "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger", * "edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger", * "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger" */ private static final String GER = "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger"; // other models @Test public void simpleGermanTest() throws Exception { FlinkMlpCommandConfig cfg = FlinkMlpCommandConfig.test(); cfg.setModel(GER); PosTagger nlpProcessor = PosTagger.create(cfg); String text = "Dies ist ein simpler Beispieltext."; List<MathTag> mathTags = WikiTextUtils.findMathTags(text); String newText = WikiTextUtils.replaceAllFormulas(text, mathTags); String cleanText = WikiTextUtils.extractPlainText(newText); List<Sentence> result = nlpProcessor.process(cleanText, mathTags); List<Word> expected = Arrays.asList(w("Dies", "PDS"), w("ist", "VAFIN"), w("ein", "ART"), w("simpler", "ADJA"), w("Beispieltext", "NN"), w(".", "$.")); List<Word> sentence = result.get(0).getWords(); assertEquals(expected, sentence.subList(0, expected.size())); LOGGER.debug("full result: {}", result); } @Test public void mediumGermanTest() throws Exception { final String text = IOUtils.toString(PosTaggerTest.class.getResourceAsStream("deText.txt")); FlinkMlpCommandConfig cfg = FlinkMlpCommandConfig.test(); cfg.setModel(GER); PosTagger nlpProcessor = PosTagger.create(cfg); List<MathTag> mathTags = WikiTextUtils.findMathTags(text); String newText = WikiTextUtils.replaceAllFormulas(text, mathTags); long t0 = System.nanoTime(); String cleanText = WikiTextUtils.extractPlainText(newText); System.out.println((System.nanoTime() - t0) / 1000000 + "ms for cleaning."); List<Sentence> result = nlpProcessor.process(cleanText, mathTags); } public static Word w(String word, String tag) { return new Word(word, tag); } public static String readText(String name) throws IOException { InputStream inputStream = PatternMatchingRelationFinder.class.getResourceAsStream(name); return IOUtils.toString(inputStream); } }