package com.formulasearchengine.mathosphere.mlp.contracts; import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument; import com.formulasearchengine.mathosphere.mlp.pojos.Relation; import com.formulasearchengine.mathosphere.mlp.pojos.Sentence; import com.formulasearchengine.mathosphere.mlp.pojos.WikiDocumentOutput; import com.formulasearchengine.mathosphere.mlp.text.DefinitionUtils; import com.formulasearchengine.mathosphere.mlp.text.PatternMatcher; import com.formulasearchengine.mathosphere.mlp.text.PatternMatcher.IdentifierMatch; import com.google.common.collect.Lists; import org.apache.flink.api.common.functions.MapFunction; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.List; import java.util.Set; public class PatternMatcherMapper implements MapFunction<ParsedWikiDocument, WikiDocumentOutput> { private static final Logger LOGGER = LoggerFactory.getLogger(PatternMatcherMapper.class); @Override public WikiDocumentOutput map(ParsedWikiDocument doc) throws Exception { List<Relation> foundRelations = Lists.newArrayList(); List<Sentence> sentences = doc.getSentences(); for (Sentence sentence : sentences) { if (!sentence.getIdentifiers().isEmpty()) { LOGGER.debug("sentence {}", sentence); } Set<String> identifiers = sentence.getIdentifiers(); PatternMatcher matcher = PatternMatcher.generatePatterns(identifiers); List<IdentifierMatch> foundMatches = matcher.match(sentence.getWords(), doc); for (IdentifierMatch match : foundMatches) { if (!DefinitionUtils.isValid(match.getDefinition())) { continue; } Relation relation = new Relation(); relation.setIdentifier(match.getIdentifier()); relation.setDefinition(match.getDefinition()); relation.setSentence(sentence); relation.setScore(1.0d); relation.setIdentifierPosition(match.getPosition()); if (!relationWasFoundBefore(foundRelations, relation)) { LOGGER.debug("found match {}", relation); foundRelations.add(relation); } } } LOGGER.info("extracted {} relations from {}", foundRelations.size(), doc.getTitle()); return new WikiDocumentOutput(doc.getTitle(), foundRelations, doc.getIdentifiers()); } private boolean relationWasFoundBefore(List<Relation> foundRelations, Relation relation) { return foundRelations.stream().filter( e -> e.getIdentifier().toLowerCase().equals(relation.getIdentifier().toLowerCase()) && e.getDefinition().toLowerCase().equals(relation.getDefinition().toLowerCase()) ).findAny().isPresent(); } }