package com.formulasearchengine.mathosphere.mlp;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.formulasearchengine.mathosphere.mlp.cli.MachineLearningDefinienClassifierConfig;
import com.formulasearchengine.mathosphere.mlp.cli.MachineLearningDefinienExtractionConfig;
import com.formulasearchengine.mathosphere.mlp.contracts.JsonSerializerMapper;
import com.formulasearchengine.mathosphere.mlp.contracts.StupidRelationScorer;
import com.formulasearchengine.mathosphere.mlp.contracts.TextAnnotatorMapper;
import com.formulasearchengine.mathosphere.mlp.contracts.TextExtractorMapper;
import com.formulasearchengine.mathosphere.mlp.pojos.EvaluationResult;
import com.formulasearchengine.mathosphere.mlp.ml.WekaClassifier;
import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument;
import com.formulasearchengine.mathosphere.mlp.pojos.Relation;
import com.formulasearchengine.mathosphere.mlp.pojos.StrippedWikiDocumentOutput;
import com.formulasearchengine.mathosphere.mlp.pojos.WikiDocumentOutput;
import com.formulasearchengine.mathosphere.mlp.text.SimpleFeatureExtractorMapper;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.core.fs.FileSystem;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Created by Leo on 10.02.2017.
 *
 * <p>Flink batch job that reads a wiki dump, runs it through the text extraction, annotation,
 * feature extraction and {@link WekaClassifier} mappers, optionally swaps the relations of each
 * document for the ones listed in a namespace-data JSON file, optionally scores the result, and
 * writes the documents as JSON text below the configured output directory.
 */
public class MachineLearningRelationClassifier {

  /** Wraps each classified document in a {@link StrippedWikiDocumentOutput} before serialization. */
  private static final MapFunction<WikiDocumentOutput, StrippedWikiDocumentOutput> stripSentenceMapper =
      (MapFunction<WikiDocumentOutput, StrippedWikiDocumentOutput>) wikiDocumentOutput ->
          new StrippedWikiDocumentOutput(wikiDocumentOutput);

  /**
   * Builds and executes the classification job.
   *
   * @param config job configuration; supplies parallelism, input, output directory, and the
   *               namespace / evaluation switches read below
   * @throws Exception if the namespace-data file cannot be read or the Flink job fails
   */
  public static void find(MachineLearningDefinienClassifierConfig config) throws Exception {
    //parse wikipedia (subset) and process afterwards
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(config.getParallelism());
    DataSource<String> source = readWikiDump(config, env);
    DataSet<ParsedWikiDocument> documents = source.flatMap(new TextExtractorMapper())
        .map(new TextAnnotatorMapper(config));
    Logger.getRootLogger().setLevel(Level.ERROR);
    DataSet<WikiDocumentOutput> instances = documents.map(new SimpleFeatureExtractorMapper(config, null));
    //process parsed wikipedia
    DataSet<WikiDocumentOutput> result = instances.map(new WekaClassifier(config));

    // The lookup table is handed to the mapper through its constructor so that it is serialized
    // together with the function. A static field would not be shipped to remote workers.
    DataSet<WikiDocumentOutput> withNamespaces = result;
    if (config.getNamespace()) {
      HashMap<String, Object> ndData = loadNamespaceData(new File(config.getNdFile()));
      withNamespaces = result.map(new NamespaceRelationMapper(ndData));
    }

    if (config.isEvaluate()) {
      String[] args = {
          "-in", config.getDataset(),
          "-out", config.getOutputDir(),
          "--goldFile", config.getQueries(),
          "--threads", "1",
          "--tex",
      };
      MachineLearningDefinienExtractionConfig evaluationConfig = MachineLearningDefinienExtractionConfig.from(args);
      DataSet<EvaluationResult> evaluationResult = withNamespaces.reduceGroup(new StupidRelationScorer(evaluationConfig));
      evaluationResult.map(new JsonSerializerMapper<>())
          .writeAsText(config.getOutputDir() + "/extractedDefiniens/evaluated", FileSystem.WriteMode.OVERWRITE);
    }

    DataSet<StrippedWikiDocumentOutput> strippedResult = withNamespaces.map(stripSentenceMapper);
    //write and kick off flink execution
    strippedResult.map(new JsonSerializerMapper<>())
        .writeAsText(config.getOutputDir() + "/extractedDefiniens", FileSystem.WriteMode.OVERWRITE);
    env.execute();
  }

  /**
   * Creates the input data source by delegating to {@link FlinkMlpRelationFinder}.
   *
   * @param config job configuration passed through to the delegate
   * @param env    execution environment the source is registered with
   * @return the data source returned by {@code FlinkMlpRelationFinder.readWikiDump}
   */
  public static DataSource<String> readWikiDump(MachineLearningDefinienClassifierConfig config,
                                                ExecutionEnvironment env) {
    return FlinkMlpRelationFinder.readWikiDump(config, env);
  }

  /**
   * Reads the namespace-data file (a JSON array of objects) and indexes its entries by their
   * normalized {@code document_title}.
   *
   * @param ndFile JSON file to read
   * @return map from normalized title to the raw JSON entry
   * @throws java.io.IOException if the file cannot be read or parsed
   */
  private static HashMap<String, Object> loadNamespaceData(File ndFile) throws java.io.IOException {
    List<?> ndList = new ObjectMapper().readValue(ndFile, List.class);
    HashMap<String, Object> byTitle = new HashMap<>();
    for (Object o : ndList) {
      Map<?, ?> entry = (Map<?, ?>) o;
      byTitle.put(toTitleKey((String) entry.get("document_title")), o);
    }
    return byTitle;
  }

  /**
   * Normalizes a title for use as a lookup key by replacing every whitespace character with an
   * underscore. Used for both building and querying the table so the two sides always agree.
   */
  private static String toTitleKey(String title) {
    return title.replaceAll("\\s", "_");
  }

  /**
   * Replaces the relations of a document with the {@code namespace_relations} recorded for its
   * title in the namespace data. Documents without an entry end up with an empty relation list.
   */
  private static final class NamespaceRelationMapper
      implements MapFunction<WikiDocumentOutput, WikiDocumentOutput> {

    private static final long serialVersionUID = 1L;

    // Declared as HashMap (not Map) to make explicit that the field is serializable.
    private final HashMap<String, Object> ndData;

    NamespaceRelationMapper(HashMap<String, Object> ndData) {
      this.ndData = ndData;
    }

    @Override
    public WikiDocumentOutput map(WikiDocumentOutput wikiDocumentOutput) throws Exception {
      wikiDocumentOutput.setRelations(new ArrayList<>());
      Map<?, ?> nd = (Map<?, ?>) ndData.get(toTitleKey(wikiDocumentOutput.getTitle()));
      if (nd == null) {
        return wikiDocumentOutput;
      }
      List<?> relNS = (List<?>) nd.get("namespace_relations");
      if (relNS != null) {
        for (Object o : relNS) {
          wikiDocumentOutput.getRelations().add(new Relation(o));
        }
      }
      return wikiDocumentOutput;
    }
  }
}