package com.formulasearchengine.mathosphere.mlp.text; import com.alexeygrigorev.rseq.*; import com.formulasearchengine.mathosphere.mlp.pojos.ParsedWikiDocument; import com.formulasearchengine.mathosphere.mlp.pojos.WikidataLink; import com.formulasearchengine.mathosphere.mlp.pojos.Word; import com.google.common.collect.Lists; import java.util.Arrays; import java.util.List; import java.util.Set; import static com.formulasearchengine.mathosphere.mlp.text.WikiTextUtils.deLinkify; public class PatternMatcher { private List<Pattern<Word>> patterns; public PatternMatcher(List<Pattern<Word>> patterns) { this.patterns = patterns; } public List<IdentifierMatch> match(List<Word> words, ParsedWikiDocument doc) { List<IdentifierMatch> result = Lists.newArrayList(); for (int i = 0; i < patterns.size(); i++) { Pattern<Word> pattern = patterns.get(i); List<Match<Word>> matches = pattern.find(words); for (Match<Word> match : matches) { String id = match.getVariable("identifier").getWord(); String def = deLinkify(match.getVariable("definition"), doc); int position = i; result.add(new IdentifierMatch(id, def, position)); } } return result; } public static PatternMatcher generatePatterns(Set<String> identifiers) { Matcher<Word> isOrAre = word("is").or(word("are")); Matcher<Word> let = word("let"); Matcher<Word> be = word("be"); Matcher<Word> by = word("by"); Matcher<Word> denotes = word("denotes").or(word("denote")); Matcher<Word> denoted = word("denoted"); Matcher<Word> the = pos("DT"); Matcher<Word> identifier = BeanMatchers.in(Word.class, "word", identifiers).captureAs("identifier"); Matcher<Word> definition = posRegExp("(NN[PS]{0,2}|NP\\+?|NN\\+|LNK)").captureAs("definition"); List<Pattern<Word>> patterns = Arrays.asList( //0 Pattern.create(definition, identifier), //1 Pattern.create(identifier, definition), //2 Pattern.create(identifier, isOrAre, definition), //3 Pattern.create(identifier, isOrAre, the, definition), //4 Pattern.create(let, identifier, be, definition), //5 Pattern.create(let, identifier, be, the, definition), //6 Pattern.create(definition, isOrAre, denoted, by, identifier), //7 Pattern.create(identifier, denotes, definition), //8 Pattern.create(identifier, denotes, the, definition) ); return new PatternMatcher(patterns); } public static XMatcher<Word> word(String word) { return BeanMatchers.eq(Word.class, "word", word); } public static XMatcher<Word> pos(String pos) { return BeanMatchers.eq(Word.class, "posTag", pos); } public static XMatcher<Word> posRegExp(String regexp) { return BeanMatchers.regex(Word.class, "posTag", regexp); } public static class IdentifierMatch { private String identifier; private String definition; private int position; public int getPosition() { return position; } /** * Constructor. * * @param identifier the identifier. * @param definition the definiens candidate. * @param position the position of the pattern which this was found. */ public IdentifierMatch(String identifier, String definition, int position) { this.identifier = identifier; this.definition = definition; this.position = position; } public String getIdentifier() { return identifier; } public String getDefinition() { return definition; } } }