package com.formulasearchengine.mathosphere.mlp.contracts;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.Multiset.Entry;
import com.formulasearchengine.mathosphere.mlp.cli.BaseConfig;
import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument;
import com.formulasearchengine.mathosphere.mlp.pojos.Relation;
import com.formulasearchengine.mathosphere.mlp.pojos.Sentence;
import com.formulasearchengine.mathosphere.mlp.pojos.WikiDocumentOutput;
import com.formulasearchengine.mathosphere.mlp.pojos.Word;
import org.apache.flink.api.common.functions.MapFunction;
import java.util.*;
/**
* Mapper that finds a list of possible identifiers and their definitions. As described in section 2 step 4 of
* https://www.google.co.jp/url?sa=t&rct=j&q=&esrc=s&source=web&cd=4&cad=rja&uact=8&ved=0ahUKEwjbo8bF5J3PAhWMcT4KHesdCRMQFgg0MAM&url=https%3A%2F%2Fwww.gipp.com%2Fwp-content%2Fpapercite-data%2Fpdf%2Fschubotz16.pdf&usg=AFQjCNG8WcokDbLBSdzddbijH-bJh4w5sA&sig2=ofIftBvBlsOdwikq2d1fag
*/
public class CreateCandidatesMapper implements MapFunction<ParsedWikiDocument, WikiDocumentOutput> {
private final BaseConfig config;
private double alpha;
private double beta;
private double gamma;
public CreateCandidatesMapper(BaseConfig config) {
this.config = config;
//copy alpha, beta and gamma for convince
this.alpha = config.getAlpha();
this.beta = config.getBeta();
this.gamma = config.getGamma();
}
@Override
public WikiDocumentOutput map(ParsedWikiDocument doc) {
Set<String> identifiers = doc.getIdentifiers().elementSet();
List<Relation> relations = Lists.newArrayList();
for (String identifier : identifiers) {
List<Relation> candidates = generateCandidates(doc, identifier);
if(config.getDefinitionMerging()){
selfMerge(candidates);
} else {
Collections.sort(candidates);
Collections.reverse(candidates);
}
for (Relation rel : candidates) {
if (rel.getScore() >= config.getThreshold()) {
relations.add(rel);
}
}
}
return new WikiDocumentOutput(doc.getTitle(), relations, doc.getIdentifiers());
}
private void selfMerge(List<Relation> candidates) {
Collections.sort(candidates,Relation::compareNameScore);
final Iterator<Relation> iterator = candidates.iterator();
Relation lastLower = null;
Relation lastElement = null;
double decayFactor;
int multiplicity = 1;
while (iterator.hasNext()) {
final Relation relation = iterator.next();
Relation lower = new Relation(relation.getIdentifier(), relation.getDefinition().toLowerCase());
if (lastLower != null && lower.compareToName(lastLower) == 0) {
multiplicity++;
decayFactor = Math.pow(2, -1.3*multiplicity);
lastElement.setScore(lastElement.getScore() + relation.getScore() * decayFactor);
iterator.remove();
} else {
multiplicity = 1;
relation.setScore(.722 * relation.getScore());
lastElement = relation;
lastLower = lower;
}
}
candidates.sort(Relation::compareTo);
}
/**
* Find a list of possible definitions for an identifier. As described in section 2 step 4 of
* https://www.google.co.jp/url?sa=t&rct=j&q=&esrc=s&source=web&cd=4&cad=rja&uact=8&ved=0ahUKEwjbo8bF5J3PAhWMcT4KHesdCRMQFgg0MAM&url=https%3A%2F%2Fwww.gipp.com%2Fwp-content%2Fpapercite-data%2Fpdf%2Fschubotz16.pdf&usg=AFQjCNG8WcokDbLBSdzddbijH-bJh4w5sA&sig2=ofIftBvBlsOdwikq2d1fag
*
* @param doc Where to search for definitions
* @param identifier What to define.
* @return {@link List<Relation>} with ranked definitions for the identifier.
*/
private List<Relation> generateCandidates(ParsedWikiDocument doc, String identifier) {
List<Sentence> sentences = findSentencesWithIdentifier(doc.getSentences(), identifier);
if (sentences.isEmpty()) {
return Collections.emptyList();
}
List<Relation> result = Lists.newArrayList();
Multiset<String> frequencies = calcFrequencies(sentences);
if (frequencies.isEmpty()) {
return Collections.emptyList();
}
int maxFrequency = calculateMax(frequencies);
for (int sentenceIdx = 0; sentenceIdx < sentences.size(); sentenceIdx++) {
Sentence sentence = sentences.get(sentenceIdx);
List<Word> words = sentence.getWords();
List<Integer> positions = identifierPositions(words, identifier);
for (int wordIdx = 0; wordIdx < words.size(); wordIdx++) {
//Definiendum
Word word = words.get(wordIdx);
if (!isGood(word)) {
continue;
}
int identifierPosition = closestIdentifierPosition(positions, wordIdx);
int distance = Math.abs(identifierPosition - wordIdx);
int freq = frequencies.count(word.toLowerCase());
double score = calculateScore(distance, freq, maxFrequency, sentenceIdx);
Relation relation = new Relation();
relation.setIdentifier(identifier);
relation.setIdentifierPosition(identifierPosition);
relation.setDefinition(word, doc);
relation.setWordPosition(wordIdx);
relation.setScore(score);
relation.setSentence(sentence);
// relation.setSentence(sentence);
result.add(relation);
}
}
return result;
}
/**
* Find a list of possible definitions for an identifier. As described in section 2 step 5 of
* https://www.google.co.jp/url?sa=t&rct=j&q=&esrc=s&source=web&cd=4&cad=rja&uact=8&ved=0ahUKEwjbo8bF5J3PAhWMcT4KHesdCRMQFgg0MAM&url=https%3A%2F%2Fwww.gipp.com%2Fwp-content%2Fpapercite-data%2Fpdf%2Fschubotz16.pdf&usg=AFQjCNG8WcokDbLBSdzddbijH-bJh4w5sA&sig2=ofIftBvBlsOdwikq2d1fag
*
* @param distance Number of tokens between identifier and definiens.
* @param frequency The term frequency of the possible definiendum.
* @param maxFrequency The max term frequency within this document. For normalisation of the term frequency.
* @param sentenceIdx The number of sentences between the definiens candidate and the sentence in which the identifier occurs for the first time
* @return Score how likely the definiendum is the correct definition for the identifier.
*/
private double calculateScore(int distance, int frequency, int maxFrequency, int sentenceIdx) {
double std1 = Math.sqrt(Math.pow(5d, 2d) / (2d * Math.log(2)));
double dist = gaussian(distance, std1);
double std2 = Math.sqrt(Math.pow(3d, 2d) / (2d * Math.log(2)));
double seq = gaussian(sentenceIdx, std2);
double relativeFrequency = (double) frequency / (double) maxFrequency;
return (alpha * dist + beta * seq + gamma * relativeFrequency) / (alpha + beta + gamma);
}
private static double gaussian(double x, double std) {
return Math.exp(-x * x / (2 * std * std));
}
/**
* Find all occurrences of the identifier in the sentence.
*/
public static List<Integer> identifierPositions(List<Word> sentence, String identifier) {
List<Integer> result = Lists.newArrayList();
for (int wordIdx = 0; wordIdx < sentence.size(); wordIdx++) {
Word word = sentence.get(wordIdx);
if (Objects.equals(identifier, word.getWord())) {
result.add(wordIdx);
}
}
return result;
}
public static int closestIdentifierPosition(List<Integer> positions, int wordIdx) {
if (positions.isEmpty()) {
return -1;
}
Iterator<Integer> it = positions.iterator();
int bestPos = it.next();
int bestDist = Math.abs(wordIdx - bestPos);
while (it.hasNext()) {
int pos = it.next();
int dist = Math.abs(wordIdx - pos);
if (dist < bestDist) {
bestDist = dist;
bestPos = pos;
}
}
return bestPos;
}
public static int calculateMax(Multiset<String> frequencies) {
Entry<String> max = Collections.max(frequencies.entrySet(),
(e1, e2) -> Integer.compare(e1.getCount(), e2.getCount()));
return max.getCount();
}
private Multiset<String> calcFrequencies(List<Sentence> sentences) {
Multiset<String> counts = HashMultiset.create();
for (Sentence sentence : sentences) {
for (Word word : sentence.getWords()) {
if (isGood(word)) {
counts.add(word.getWord().toLowerCase());
}
}
}
return counts;
}
private boolean isGood(Word in) {
String word = in.getWord();
String posTag = in.getPosTag();
/* if (!DefinitionUtils.isValid(word)) {
return false;
}
if ("ID".equals(posTag)) {
return false;
}*/
if (word.length() < 3) {
return false;
}
if (word.contains("<")) {
// remove tags and so
//TODO: Make a white-list of allowed chars.
return false;
}
// we're only interested in nouns, entities and links
return posTag.matches("NN[PS]{0,2}|NP\\+?|NN\\+|LNK");
}
/**
* Find all sentences with the given identifier.
*
* @return {@link ArrayList} with the sentences containing the identifier.
*/
public static List<Sentence> findSentencesWithIdentifier(List<Sentence> sentences, String identifier) {
List<Sentence> result = Lists.newArrayList();
for (Sentence sentence : sentences) {
if (sentence.contains(identifier)) {
result.add(sentence);
}
}
return result;
}
}