package com.formulasearchengine.mathosphere.mlp; import com.formulasearchengine.mathosphere.mlp.cli.MachineLearningDefinienExtractionConfig; import com.formulasearchengine.mathosphere.mlp.contracts.JsonSerializerMapper; import com.formulasearchengine.mathosphere.mlp.contracts.StupidRelationScorer; import com.formulasearchengine.mathosphere.mlp.contracts.TextAnnotatorMapper; import com.formulasearchengine.mathosphere.mlp.contracts.TextExtractorMapper; import com.formulasearchengine.mathosphere.mlp.pojos.EvaluationResult; import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument; import com.formulasearchengine.mathosphere.mlp.pojos.WikiDocumentOutput; import com.formulasearchengine.mathosphere.mlp.text.SimpleFeatureExtractorMapper; import com.formulasearchengine.mlp.evaluation.Evaluator; import com.formulasearchengine.mlp.evaluation.pojo.GoldEntry; import org.apache.flink.api.java.DataSet; import org.apache.flink.api.java.ExecutionEnvironment; import org.apache.flink.api.java.operators.DataSource; import org.apache.flink.core.fs.FileSystem.WriteMode; import org.apache.log4j.Level; import org.apache.log4j.Logger; import java.io.File; import java.util.ArrayList; public class StupidRelationFinder { /** * Finds identifier - definiens pairs. Simply lists everything it finds without scoring. Also does the evaluation. * * @param config * @throws Exception */ public static void find(MachineLearningDefinienExtractionConfig config) throws Exception { ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setParallelism(config.getParallelism()); DataSource<String> source = readWikiDump(config, env); DataSet<ParsedWikiDocument> documents = source.flatMap(new TextExtractorMapper()) .map(new TextAnnotatorMapper(config)); Logger.getRootLogger().setLevel(Level.ERROR); ArrayList<GoldEntry> gold = (new Evaluator()).readGoldEntries(new File(config.getGoldFile())); DataSet<WikiDocumentOutput> instances = documents.map(new SimpleFeatureExtractorMapper(config, gold)); //get extraction results and rate all of them without selecting DataSet<EvaluationResult> result = instances.reduceGroup(new StupidRelationScorer(config)); //write to kick off flink execution result.map(new JsonSerializerMapper<>()) .writeAsText(config.getOutputDir() + "\\tmp", WriteMode.OVERWRITE); env.execute(); } public static DataSource<String> readWikiDump(MachineLearningDefinienExtractionConfig config, ExecutionEnvironment env) { return FlinkMlpRelationFinder.readWikiDump(config, env); } }