package com.formulasearchengine.mathosphere.mlp.text; import com.alexeygrigorev.rseq.*; import com.formulasearchengine.mathosphere.mlp.ml.WekaUtils; import com.formulasearchengine.mathosphere.mlp.pojos.Sentence; import com.formulasearchengine.mathosphere.mlp.pojos.Word; import java.util.*; public class MachineLearningPatternMatcher { public static final String IDENTIFIER = "identifier"; public static final String DEFINITION = "definition"; public static final String OTHERMATH = "othermath"; public static final String IS = "is"; public static final String ARE = "are"; public static final String LET = "let"; public static final String BE = "be"; public static final String DENOTES = "denotes"; public static final String DENOTE = "denote"; public static final String DENOTED = "denoted"; public static final String BY = "by"; public static final Matcher<Word> isOrAre = word(IS).or(word(ARE)); public static final Matcher<Word> let = word(LET); public static final Matcher<Word> be = word(BE); public static final Matcher<Word> by = word(BY); public static final Matcher<Word> denotes = word(DENOTES).or(word(DENOTE)); public static final Matcher<Word> denoted = word(DENOTED); public static final Matcher<Word> the = pos("DT"); /** * @param sentence * @param identifierText * @param definiens * @param identifierPosition 0 indexed * @param definiensPosition 0 indexed * @return [pattern1, ... , pattern10, colon between, comma between, othermath between, definiens in parens, identifier in parens] */ public double[] match(Sentence sentence, String identifierText, String definiens, int identifierPosition, int definiensPosition) { Matcher<Word> identifier = BeanMatchers.eq(Word.class, "word", identifierText).captureAs(IDENTIFIER); Matcher<Word> definition = BeanMatchers.eq(Word.class, "word", definiens).captureAs(DEFINITION); Matcher<Word> otherMathExpression = posRegExp("(ID|MATH)").captureAs(OTHERMATH); List<Pattern<Word>> patterns = Arrays.asList( //1 not in pagel Pattern.create(identifier, definition), //2 pagel 1 Pattern.create(definition, identifier), //3 pagel 2 Pattern.create(identifier, isOrAre, definition), //4 pagel 3 Pattern.create(identifier, isOrAre, the, definition), //5 pagel 4 Pattern.create(let, identifier, be, denoted, by, definition), //6 pagel 4 Pattern.create(let, identifier, be, denoted, by, the, definition), //7 pagel 5 Pattern.create(definition, isOrAre, denoted, by, identifier), //8 pagel 5 Pattern.create(definition, isOrAre, denoted, by, the, identifier), //9 pagel 6 Pattern.create(identifier, denotes, definition), //10 pagel 6 Pattern.create(identifier, denotes, the, definition), //11 //colon Pattern.create(pos(":")), //12 //comma Pattern.create(pos(",")), //13 //othermath Pattern.create(otherMathExpression), //14 //definiens in parentheses, relative to identifier Pattern.create(word("(").or(pos("-LRB-"))), //15 //identifier in parentheses, relative to definiens Pattern.create(word(")").or(pos("-RRB-"))) ); double[] result = new double[patterns.size()]; long openingParentheses = 0; for (int i = 0; i < patterns.size(); i++) { Pattern<Word> pattern = patterns.get(i); List<Match<Word>> matches = pattern.find(sentence.getWords()); switch (i) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: case 8: case 9: for (Match<Word> match : matches) { //check that the positions match //if (match.matchedFrom() + identifierPattern.find(match.getMatchedSubsequence()).get(0).matchedFrom() == identifierPosition && // match.matchedFrom() + definiensPattern.find(match.getMatchedSubsequence()).get(0).matchedFrom() == definiensPosition) { Word matchedDefiniens = match.getVariable(DEFINITION); if (matchedDefiniens != null && matchedDefiniens.getWord().equals(definiens)) result[i] = 1; //} } break; case 10: case 11: case 12: for (Match<Word> match : matches) { if (inRange(match.matchedFrom(), identifierPosition, definiensPosition)) result[i] = 1; } break; case 13: openingParentheses = matches.stream().filter(m -> inRange(m.matchedFrom(), identifierPosition, definiensPosition)).count(); break; case 14: //definiens in parentheses long closingParentheses = matches.stream().filter(m -> inRange(m.matchedFrom(), identifierPosition, definiensPosition)).count(); if (identifierPosition < definiensPosition) { if (openingParentheses - closingParentheses > 0) { //definiens in parentheses result[13] = 1; } else if (openingParentheses - closingParentheses < 0) //identifier in parentheses result[14] = 1; } if (identifierPosition > definiensPosition) { if (openingParentheses - closingParentheses > 0) { //identifier in parentheses result[14] = 1; } else if (openingParentheses - closingParentheses < 0) //definiens in parentheses result[13] = 1; } } } return result; } /** * Checks if x lies between y and z * * @return */ private static boolean inRange(int x, int y, int z) { return ((y < x && x < z) || (z < x && x < y)); } protected static XMatcher<Word> word(String word) { return BeanMatchers.eq(Word.class, "word", word); } protected static XMatcher<Word> pos(String pos) { return BeanMatchers.eq(Word.class, "posTag", pos); } protected static XMatcher<Word> posRegExp(String regexp) { return BeanMatchers.regex(Word.class, "posTag", regexp); } }