package com.formulasearchengine.mathosphere.mlp.contracts;
import com.formulasearchengine.mathosphere.mlp.PatternMatchingRelationFinder;
import com.formulasearchengine.mathosphere.mlp.flink.ListCollector;
import com.formulasearchengine.mathosphere.mlp.pojos.RawWikiDocument;
import org.apache.commons.io.IOUtils;
import org.junit.Test;
import java.io.InputStream;
import java.util.List;
import static org.hamcrest.CoreMatchers.containsString;
import static org.hamcrest.CoreMatchers.not;
import static org.junit.Assert.*;
/**
 * Tests for {@link TextExtractorMapper}.
 *
 * <p>Each test loads a wiki XML dump from the classpath (resolved relative to
 * {@link PatternMatchingRelationFinder}), splits it at {@code </page>}, feeds every
 * chunk to the mapper and checks the collected {@link RawWikiDocument}s.
 */
public class TextExtractorMapperTest {

  /**
   * Encoding used to decode the test resources. Stated explicitly so the
   * non-ASCII titles and texts compared below do not depend on the platform
   * default charset.
   */
  private static final String ENCODING = "UTF-8";

  /** Math tag as it appears once markup is unescaped. */
  private static final String MATH_TAG = "<math";

  /** XML-escaped form of {@link #MATH_TAG}. */
  private static final String ESCAPED_MATH_TAG = "&lt;math";

  /**
   * Extracts the documents from the English sample dump and checks count,
   * titles and the form of the math tags in the first document.
   */
  @Test
  public void test() throws Exception {
    String rawInput = readResource("augmentendwikitext.xml");
    // NOTE(review): if the dump stores math tags only in escaped form, this
    // precondition should test ESCAPED_MATH_TAG instead - confirm against the fixture.
    assertTrue(rawInput.contains(MATH_TAG));

    List<RawWikiDocument> output = extractDocuments(rawInput);
    assertEquals(2, output.size());

    RawWikiDocument doc1 = output.get(0);
    assertEquals("Schrödinger equation", doc1.title);
    // The original asserted both absence and presence of the same literal, which can
    // never pass. Presumably the intent is: no escaped tags remain, real tags do.
    assertFalse(doc1.text.contains(ESCAPED_MATH_TAG));
    assertTrue(doc1.text.contains(MATH_TAG));

    RawWikiDocument doc2 = output.get(1);
    // Expected value first, so a failure message reports the right sides.
    assertEquals("Gas constant", doc2.title);
  }

  /**
   * Extracts the single document from the German sample dump and compares its
   * text against the stored expectation in {@code text/deText.txt}.
   */
  @Test
  public void testGer() throws Exception {
    String rawInput = readResource("dewikimath-20151213130534.xml");
    final String expected = readResource("text/deText.txt");
    // NOTE(review): same caveat as in test() regarding escaped vs. unescaped tags.
    assertTrue(rawInput.contains(MATH_TAG));

    List<RawWikiDocument> output = extractDocuments(rawInput);
    assertEquals(1, output.size());

    RawWikiDocument doc1 = output.get(0);
    assertEquals("Clapeyron-Gleichung", doc1.title);
    // See test(): the negative check targets the escaped form, the positive one the real tag.
    assertThat(doc1.text, not(containsString(ESCAPED_MATH_TAG)));
    assertThat(doc1.text, containsString(MATH_TAG));
    assertEquals(expected, doc1.text);
  }

  /**
   * Reads a classpath resource, resolved relative to {@link PatternMatchingRelationFinder},
   * fully into a string and closes the underlying stream.
   *
   * @param name resource name as passed to {@link Class#getResourceAsStream(String)}
   * @return the resource content decoded with {@link #ENCODING}
   * @throws Exception if the resource cannot be read
   */
  private static String readResource(String name) throws Exception {
    try (InputStream stream = PatternMatchingRelationFinder.class.getResourceAsStream(name)) {
      // getResourceAsStream returns null for a missing resource; fail with a clear message
      // instead of a NullPointerException further down.
      assertNotNull("Test resource not found: " + name, stream);
      return IOUtils.toString(stream, ENCODING);
    }
  }

  /**
   * Splits the raw dump at {@code </page>} and passes every chunk through a fresh
   * {@link TextExtractorMapper}.
   *
   * @param rawInput complete content of the dump
   * @return the documents the mapper emitted, in emission order
   * @throws Exception if the mapper fails on a chunk
   */
  private static List<RawWikiDocument> extractDocuments(String rawInput) throws Exception {
    TextExtractorMapper textExtractor = new TextExtractorMapper();
    ListCollector<RawWikiDocument> collected = new ListCollector<>();
    for (String page : rawInput.split("</page>")) {
      textExtractor.flatMap(page, collected);
    }
    return collected.getList();
  }
}