package com.formulasearchengine.mathosphere.mlp.contracts;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import com.formulasearchengine.mathosphere.mlp.cli.BaseConfig;
import com.formulasearchengine.mathosphere.mlp.pojos.MathTag;
import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument;
import com.formulasearchengine.mathosphere.mlp.pojos.RawWikiDocument;
import com.formulasearchengine.mathosphere.mlp.pojos.Sentence;
import com.formulasearchengine.mathosphere.mlp.pojos.WikidataLink;
import com.formulasearchengine.mathosphere.mlp.text.MathConverter;
import com.formulasearchengine.mathosphere.mlp.text.PosTagger;
import com.formulasearchengine.mathosphere.mlp.text.WikiTextUtils;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
/**
 * Flink map function that turns a {@link RawWikiDocument} into a {@link ParsedWikiDocument}.
 *
 * <p>For each document the math tags are located, the remaining text is handed to the
 * {@link PosTagger}, and the identifiers of all math tags are aggregated into one multiset.
 * The {@link PosTagger} is created in {@link #open(Configuration)}, so that method has to be
 * called before {@link #map(RawWikiDocument)} or {@link #parse(String, String)} are used.
 */
public class TextAnnotatorMapper extends RichMapFunction<RawWikiDocument, ParsedWikiDocument> {

  private static final Logger LOGGER = LoggerFactory.getLogger(TextAnnotatorMapper.class);

  private final BaseConfig config;

  /** Created in {@link #open(Configuration)}; {@code null} until then. */
  private PosTagger posTagger;

  /**
   * @param config configuration used to create the tagger and to control how math tags and
   *               identifiers are extracted
   */
  public TextAnnotatorMapper(BaseConfig config) {
    this.config = config;
  }

  /**
   * Creates the {@link PosTagger} from the configuration given to the constructor.
   *
   * @param cfg Flink configuration; not read by this method
   */
  @Override
  public void open(Configuration cfg) throws Exception {
    posTagger = PosTagger.create(config);
  }

  /**
   * Parses the text of the given document under the document's title.
   *
   * @param doc raw document providing {@code title} and {@code text}
   * @return the parsed document
   */
  @Override
  public ParsedWikiDocument map(RawWikiDocument doc) throws Exception {
    LOGGER.info("processing \"{}\"...", doc.title);

    final ParsedWikiDocument parse = parse(doc.text, doc.title);
    LOGGER.debug("identifiers in \"{}\" from {} formulas: {}", doc.title, parse.getFormulas().size(),
        parse.getIdentifiers());

    return parse;
  }

  /**
   * Extracts math tags, sentences and identifiers from the given wikitext.
   *
   * <p>Text processing is best effort: if any step throws, a warning is logged and the result
   * is built with empty math tag and sentence lists instead of propagating the exception.
   * Links that were already collected before the failure are kept.
   *
   * @param wikitext the wiki markup to process
   * @param title    title stored in the result and used in log messages
   * @return the parsed document; its links are {@code null} unless TeX identifiers are enabled
   *         in the configuration and the conversion got far enough to collect them
   */
  public ParsedWikiDocument parse(String wikitext, String title) {
    List<Sentence> sentences;
    List<WikidataLink> links = null;
    List<MathTag> mathTags;
    try {
      String cleanText;
      if (config.getUseTeXIdentifiers()) {
        MathConverter c = new MathConverter(wikitext, title, config);
        cleanText = c.getStrippedOutput();
        mathTags = c.getMathTags();
        links = c.getLinks();
      } else {
        mathTags = WikiTextUtils.findMathTags(wikitext);
        String newText = WikiTextUtils.replaceAllFormulas(wikitext, mathTags);
        cleanText = WikiTextUtils.extractPlainText(newText);
      }
      sentences = posTagger.process(cleanText, mathTags);
    } catch (Exception e) {
      // The title needs its own placeholder; the trailing exception is still logged with
      // its stack trace because it is the last argument and has no placeholder.
      LOGGER.warn("Problem with text processing of \"{}\"", title, e);
      mathTags = new ArrayList<>();
      sentences = new ArrayList<>();
    }

    // Sum up the identifier counts over all formulas of the document.
    Multiset<String> allIdentifiers = HashMultiset.create();
    for (MathTag formula : mathTags) {
      for (Multiset.Entry<String> entry : formula.getIdentifiers(config).entrySet()) {
        allIdentifiers.add(entry.getElement(), entry.getCount());
      }
    }

    return new ParsedWikiDocument(title, allIdentifiers, mathTags, sentences, links);
  }

  /**
   * Parses wikitext that has no title, using the placeholder title {@code "no title specified"}.
   *
   * @param wikitext the wiki markup to process
   * @return the parsed document
   */
  public ParsedWikiDocument parse(String wikitext) {
    return parse(wikitext, "no title specified");
  }
}