package com.formulasearchengine.mathosphere.mlp.text; import com.alexeygrigorev.rseq.*; import com.formulasearchengine.mathosphere.mlp.pojos.*; import java.util.*; /** * Simply extracts whole sentences with identifiers and definitions. */ public class SimplePatternMatcher { public static final String IDENTIFIER = "identifier"; public static final String DEFINITION = "definition"; private List<Pattern<Word>> patterns; private SimplePatternMatcher(List<Pattern<Word>> patterns) { this.patterns = patterns; } /** * Find all identifier-definiens candidates in the document. * * @param sentence The sentence to check. * @param doc The document that contains the sentence. * @return identifier-definiens candidates, boxed in relations. */ public Collection<Relation> match(Sentence sentence, ParsedWikiDocument doc) { List<Relation> result = new ArrayList<>(); List<Match<Word>> identifierMatches = patterns.get(0).find(sentence.getWords()); if (identifierMatches.size() > 0) { List<Match<Word>> definiensMatches = patterns.get(1).find(sentence.getWords()); if (definiensMatches.size() > 0) { for (Match<Word> identifier : identifierMatches) { for (Match<Word> definiens : definiensMatches) { if (definiens.getVariable(DEFINITION).getWord().length() >= 3) { Relation relation = new Relation(); relation.setIdentifier(identifier.getVariable(IDENTIFIER).getWord()); relation.setDefinition(definiens.getVariable(DEFINITION), doc); //replace the definiens in the sentence with the word as it is stored in the relation. Word definiensWord = sentence.getWords().remove(definiens.matchedFrom()); Word cleanDefiniensWord = new Word(relation.getDefinition(), definiensWord.getPosTag()); sentence.getWords().add(definiens.matchedFrom(), cleanDefiniensWord); relation.setSentence(sentence); relation.setIdentifierPosition(identifier.matchedFrom()); relation.setWordPosition(definiens.matchedFrom()); result.add(relation); } } } } } return result; } public static SimplePatternMatcher generatePatterns(Set<String> identifiers) { Matcher<Word> identifier = BeanMatchers.in(Word.class, "word", identifiers).captureAs(IDENTIFIER); Matcher<Word> definition = posRegExp("(NN[PS]{0,2}|NP\\+?|NN\\+|LNK)").captureAs(DEFINITION); List<Pattern<Word>> patterns = Arrays.asList( Pattern.create(identifier), Pattern.create(definition) ); return new SimplePatternMatcher(patterns); } protected static XMatcher<Word> posRegExp(String regexp) { return BeanMatchers.regex(Word.class, "posTag", regexp); } }