package edu.stanford.nlp.naturalli; import edu.stanford.nlp.ling.CoreAnnotation; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher; import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern; import java.util.*; /** * <p> * Translate a question to a statement. For example, "where was Obama born?" to "Obama was born in ?". * </p> * * <p> * This class was developed for, and therefore likely performs best on (read: "over-fits gloriously to") * the webquestions dataset (http://www-nlp.stanford.edu/software/sempre/). * The rules were created based off of the webquestions * training set, and tested against the sentences in the QuestionToStatementTranslatorTest. * If something fails, please add it to the test when you fix it! * If you change something here, please validate it wit the test! * </p> * * @author Gabor Angeli */ @SuppressWarnings("unchecked") public class QuestionToStatementTranslator { public static class UnknownTokenMarker implements CoreAnnotation<Boolean> { @Override public Class<Boolean> getType() { return Boolean.class; } } /** The missing word marker, when the object of the sentence is not type constrained. */ private final CoreLabel WORD_MISSING = new CoreLabel(){{ setWord("thing"); setValue("thing"); setLemma("thing"); setTag("NN"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); set(UnknownTokenMarker.class, true); }}; /** The missing word marker typed as a location. */ private final CoreLabel WORD_MISSING_LOCATION = new CoreLabel(){{ setWord("location"); setValue("location"); setLemma("location"); setTag("NNP"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); set(UnknownTokenMarker.class, true); }}; /** The missing word marker typed as a person. */ private final CoreLabel WORD_MISSING_PERSON = new CoreLabel(){{ setWord("person"); setValue("person"); setLemma("person"); setTag("NNP"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); set(UnknownTokenMarker.class, true); }}; /** The missing word marker typed as a time. */ private final CoreLabel WORD_MISSING_TIME = new CoreLabel(){{ setWord("time"); setValue("time"); setLemma("time"); setTag("NN"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); set(UnknownTokenMarker.class, true); }}; /** The word "," as a CoreLabel */ private final CoreLabel WORD_COMMA = new CoreLabel(){{ setWord(","); setValue(","); setLemma(","); setTag(","); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); }}; /** The word "from" as a CoreLabel */ private final CoreLabel WORD_FROM = new CoreLabel(){{ setWord("from"); setValue("from"); setLemma("from"); setTag("IN"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); }}; /** The word "at" as a CoreLabel */ private final CoreLabel WORD_AT = new CoreLabel(){{ setWord("at"); setValue("at"); setLemma("at"); setTag("IN"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); }}; /** The word "in" as a CoreLabel */ private final CoreLabel WORD_IN = new CoreLabel(){{ setWord("in"); setValue("in"); setLemma("in"); setTag("IN"); setNER("O"); setIndex(-1); setBeginPosition(-1); setEndPosition(-1); }}; private final Set<String> fromNotAtDict = Collections.unmodifiableSet(new HashSet<String>() {{ add("funding"); add("oil"); }}); /** * The pattern for "what is ..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhatIs(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhatIs = TokenSequencePattern.compile( "[{lemma:/what|which/; tag:/W.*/}] " + "(?$answer_type [tag:/N.*/]+)? " + "(?$be [{lemma:be}] )" + "(?: /the/ (?$answer_type [word:/name/]) [tag:/[PW].*/])? " + "(?$statement_body []+?) " + "(?$prep_num [!{tag:IN}] [tag:CD] )? " + "(?$suffix [tag:/[RI].*/] )? " + "(?$punct [word:/[?\\.!]/])"); /** * Process sentences matching the "what is ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhatIs */ private List<CoreLabel> processWhatIs(TokenSequenceMatcher matcher) { // Grab the body of the sentence List<CoreLabel> body = (List<CoreLabel>) matcher.groupNodes("$statement_body"); // Add the "be" token // [Gabor]: This is black magic -- if the "be" got misplaced, God help us all. // [Gabor]: Mostly you. You'll need most of the help. List<CoreLabel> be = (List<CoreLabel>) matcher.groupNodes("$be"); List<CoreLabel> suffix = (List<CoreLabel>) matcher.groupNodes("$suffix"); boolean addedBe = false; boolean addedSuffix = false; for (int i = 1; i < body.size(); ++i) { CoreLabel tokI = body.get(i); if (tokI.tag() != null && (tokI.tag().startsWith("V") || (tokI.tag().startsWith("J") && suffix != null) || (tokI.tag().startsWith("D") && suffix != null) || (tokI.tag().startsWith("R") && suffix != null) )) { body.add(i, be.get(0)); i += 1; if (suffix != null) { while (i < body.size() && body.get(i).tag() != null && (body.get(i).tag().startsWith("J") || body.get(i).tag().startsWith("V") || body.get(i).tag().startsWith("R") || body.get(i).tag().startsWith("N") || body.get(i).tag().startsWith("D")) && !body.get(i).tag().equals("VBG")) { i += 1; } body.add(i, suffix.get(0)); addedSuffix = true; } addedBe = true; break; } } // Tweak to handle dropped prepositions List<CoreLabel> prepNum = (List<CoreLabel>) matcher.groupNodes("$prep_num"); if (prepNum != null) { body.add(prepNum.get(0)); body.add(WORD_IN); body.add(prepNum.get(1)); } // Add the "be" and suffix if (!addedBe) { body.addAll(be); } if (!addedSuffix && suffix != null) { body.addAll(suffix); } // Grab the object List<CoreLabel> objType = (List<CoreLabel>) matcher.groupNodes("$answer_type"); // (try to insert the object earlier) int i = body.size() - 1; while (i >= 1 && body.get(i).tag() != null && (body.get(i).tag().startsWith("N") || body.get(i).tag().startsWith("J"))) { i -= 1; } // (actually insert the object) if (objType == null || objType.isEmpty() || (objType.size() == 1 && objType.get(0).word().equals("name"))) { // (case: untyped) if (i < body.size() - 1 && body.get(i).tag() != null && body.get(i).tag().startsWith("IN")) { body.add(i, WORD_MISSING); } else { body.add(WORD_MISSING); } } else { // (case: typed) for (CoreLabel obj : objType) { obj.set(UnknownTokenMarker.class, true); } body.addAll(objType); } // Return return body; } /** * The pattern for "what/which NN is ..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhNNIs(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhNNIs = TokenSequencePattern.compile( "[{lemma:/what|which/; tag:/W.*/}] " + "(?$answer_type [!{lemma:be}]+) " + "(?$be [{lemma:be}] [{tag:/[VRIJ].*/}] ) " + "(?$statement_body []+?) " + "(?$punct [word:/[?\\.!]/])"); /** * Process sentences matching the "what NN is ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhNNIs */ private List<CoreLabel> processWhNNIs(TokenSequenceMatcher matcher) { List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$answer_type"); for (CoreLabel lbl : sentence) { lbl.set(UnknownTokenMarker.class, true); } sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$be")); sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$statement_body")); return sentence; } /** * The pattern for "what/which NN have ..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhNNHave(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhNNHave = TokenSequencePattern.compile( "[{lemma:/what|which/; tag:/W.*/}] " + "(?$answer_type [!{tag:/V.*/}]+) " + "(?$have [{lemma:have} | {lemma:do}] ) " + "(?$pre_verb [!{tag:/V.*/}]+ ) " + "(?$verb [{tag:/V.*/}] [{tag:IN}]? ) " + "(?$post_verb []+ )? " + "(?$punct [word:/[?\\.!]/])"); /** * Process sentences matching the "what NN has ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhNNHave */ private List<CoreLabel> processWhNNHave(TokenSequenceMatcher matcher) { List<CoreLabel> sentence = new ArrayList<>(); // Add prefix sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$pre_verb")); // Add have/do List<CoreLabel> have = (List<CoreLabel>) matcher.groupNodes("$have"); if (have != null && have.size() > 0 && have.get(0).lemma() != null && have.get(0).lemma().equals("have")) { sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$have")); } // Compute answer type List<CoreLabel> answer = (List<CoreLabel>) matcher.groupNodes("$answer_type"); if (answer != null) { for (CoreLabel lbl : answer) { lbl.set(UnknownTokenMarker.class, true); } } // Add verb + Answer List<CoreLabel> verb = (List<CoreLabel>) matcher.groupNodes("$verb"); List<CoreLabel> post = (List<CoreLabel>) matcher.groupNodes("$post_verb"); if (verb.size() < 2 || post == null || post.size() == 0 || post.get(0).tag() == null || post.get(0).tag().equals("CD")) { sentence.addAll(verb); if (answer == null) { sentence.add(WORD_MISSING); } else { sentence.addAll(answer); } } else { sentence.add(verb.get(0)); if (answer == null) { sentence.add(WORD_MISSING); } else { sentence.addAll(answer); } sentence.addAll(verb.subList(1, verb.size())); } // Add postfix if (post != null) { if (post.size() == 1 && post.get(0).tag() != null && post.get(0).tag().equals("CD")) { sentence.add(WORD_IN); } sentence.addAll(post); } // Return return sentence; } /** * The pattern for "what/which NN have NN ..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhNNHaveNN(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhNNHaveNN = TokenSequencePattern.compile( "[{lemma:/what|which/; tag:/W.*/}] " + "(?$answer_type [tag:/N.*/]+) " + "(?$have [{lemma:have}] ) " + "(?$statement_body [!{tag:/V.*/}]+?) " + "(?$punct [word:/[?\\.!]/])"); /** * Process sentences matching the "what NN have NN ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhNNHaveNN */ private List<CoreLabel> processWhNNHaveNN(TokenSequenceMatcher matcher) { List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$answer_type"); for (CoreLabel lbl : sentence) { lbl.set(UnknownTokenMarker.class, true); } sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$have")); sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$statement_body")); return sentence; } /** * The pattern for "what is there ..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhatIsThere(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhatIsThere = TokenSequencePattern.compile( "[{lemma:/what|which/; tag:/W.*/}] " + "(?$answer_type [tag:/N.*/]+)? " + "(?$be [{lemma:be}] )" + "(?$there [{lemma:there; tag:RB}] ) " + "(?$adjmod [{tag:/[JN].*/}] )? " + "(?$to_verb [{tag:TO}] [{tag:/V.*/}] )? " + "(?$statement_body [{tag:IN}] []+?) " + "(?$punct [word:/[?\\.!]/])"); /** * Process sentences matching the "what is ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhatIsThere */ private List<CoreLabel> processWhatIsThere(TokenSequenceMatcher matcher) { List<CoreLabel> optSpan; // Grab the prefix of the sentence List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$there"); sentence.addAll((List<CoreLabel>) matcher.groupNodes("$be")); // Grab the unknown term if ((optSpan = (List<CoreLabel>) matcher.groupNodes("$adjmod")) != null) { sentence.addAll(optSpan); } sentence.add(WORD_MISSING); // Add body if ((optSpan = (List<CoreLabel>) matcher.groupNodes("$to_verb")) != null) { sentence.addAll(optSpan); } sentence.addAll((Collection<CoreLabel>) matcher.groupNodes("$statement_body")); // Return return sentence; } /** * The pattern for "where do..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhereDo(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhereDo = TokenSequencePattern.compile( "[{lemma:where; tag:/W.*/}] " + "(?$do [ {lemma:/do/} ]) " + "(?$statement_body []+?) " + "(?$at [tag:/[IT].*/] )? " + "(?$loc [tag:/N.*/] )*? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "where do ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhereDo */ private List<CoreLabel> processWhereDo(TokenSequenceMatcher matcher) { // Get the "at" preposition and the "location" missing marker to use List<CoreLabel> specloc = (List<CoreLabel>) matcher.groupNodes("$loc"); CoreLabel wordAt = WORD_AT; CoreLabel missing = WORD_MISSING_LOCATION; if (specloc != null && fromNotAtDict.contains(specloc.get(0).word())) { wordAt = WORD_FROM; missing = WORD_MISSING; } // Grab the prefix of the sentence List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$statement_body"); // (check if we should be looking for a location) for (CoreLabel lbl : sentence) { if ("name".equals(lbl.word())) { missing = WORD_MISSING; } } // Add the "at" part List<CoreLabel> at = (List<CoreLabel>) matcher.groupNodes("$at"); if (at != null && at.size() > 0) { sentence.addAll(at); } else { if (specloc != null) { sentence.addAll(specloc); } sentence.add(wordAt); } // Add the location sentence.add(missing); // Add an optional specifier location if (specloc != null && at != null) { sentence.add(WORD_COMMA); sentence.addAll(specloc); } // Return return sentence; } /** * The pattern for "where is..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhereIs(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhereIs = TokenSequencePattern.compile( "[{lemma:where; tag:/W.*/}] " + "(?$be [ {lemma:/be/} ]) " + "(?$initial_verb [tag:/[VJ].*/] )? " + "(?$statement_body []+?) " + "(?$ignored [lemma:locate] [tag:IN] [word:a]? [word:map]? )? " + "(?$final_verb [tag:/[VJ].*/] )? " + "(?$at [tag:IN] )? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "where is ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhereIs */ private List<CoreLabel> processWhereIs(TokenSequenceMatcher matcher) { // Grab the prefix of the sentence List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$statement_body"); // Add the "is" part List<CoreLabel> be = (List<CoreLabel>) matcher.groupNodes("$be"); sentence.addAll(be); // Add the optional final verb List<CoreLabel> verb = (List<CoreLabel>) matcher.groupNodes("$final_verb"); if (verb != null) { sentence.addAll(verb); } // Add the optional initial verb (from disfluent questions!) verb = (List<CoreLabel>) matcher.groupNodes("$initial_verb"); if (verb != null) { sentence.addAll(verb); } // Add the "at" part List<CoreLabel> at = (List<CoreLabel>) matcher.groupNodes("$at"); if (at != null && at.size() > 0) { sentence.addAll(at); } else { sentence.add(WORD_AT); } // Add the location sentence.add(WORD_MISSING_LOCATION); // Return return sentence; } /** * The pattern for "who is..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhoIs(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhoIs = TokenSequencePattern.compile( "[{lemma:who; tag:/W.*/}] " + "(?$be [ {lemma:/be/} ] ) " + "(?$prep [ {tag:/IN|V.*/} ] )? " + "(?$statement_body []+?) " + "(?$final_verb [tag:/V.*/] [tag:/[IRT].*/] )? " + "(?$final_verb [tag:VBG] )? " + "(?$now [tag:RB] )? " + "(?$prep_num [!{tag:IN}] [tag:CD] )? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "who is ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhoIs */ private List<CoreLabel> processWhoIs(TokenSequenceMatcher matcher) { List<CoreLabel> sentence = new ArrayList<>(); List<CoreLabel> prep = (List<CoreLabel>) matcher.groupNodes("$prep"); boolean addedBe = false; if (prep != null && !prep.isEmpty()) { // Add the person sentence.add(WORD_MISSING_PERSON); // Add the "is" part List<CoreLabel> be = (List<CoreLabel>) matcher.groupNodes("$be"); sentence.addAll(be); addedBe = true; // Add the preposition sentence.addAll(prep); // Grab the prefix of the sentence sentence.addAll((List<CoreLabel>) matcher.groupNodes("$statement_body")); } else { // Grab the prefix of the sentence sentence.addAll((List<CoreLabel>) matcher.groupNodes("$statement_body")); // Tweak to handle dropped prepositions List<CoreLabel> prepNum = (List<CoreLabel>) matcher.groupNodes("$prep_num"); if (prepNum != null) { sentence.add(prepNum.get(0)); sentence.add(WORD_IN); sentence.add(prepNum.get(1)); } // Add the "is" part List<CoreLabel> be = (List<CoreLabel>) matcher.groupNodes("$be"); if (sentence.size() > 1 && !sentence.get(sentence.size() - 1).word().equals("be")) { sentence.addAll(be); addedBe = true; } // Add the final verb part List<CoreLabel> verb = (List<CoreLabel>) matcher.groupNodes("$final_verb"); if (verb != null) { if (verb.size() > 1 && verb.get(verb.size() - 1).word().equals("too")) { // Fix common typo verb.get(verb.size() - 1).setWord("to"); verb.get(verb.size() - 1).setValue("to"); verb.get(verb.size() - 1).setLemma("to"); verb.get(verb.size() - 1).setTag("IN"); } sentence.addAll(verb); } // Add the person sentence.add(WORD_MISSING_PERSON); } // Add a final modifier (e.g., "now") List<CoreLabel> now = (List<CoreLabel>) matcher.groupNodes("$now"); if (now != null) { sentence.addAll(now); } // Insert "was" before first verb, if applicable if (!addedBe) { for (int i = 0; i < sentence.size(); ++i) { if (sentence.get(i).tag() != null && sentence.get(i).tag().startsWith("V")) { sentence.add(i, (CoreLabel) matcher.groupNodes("$be").get(0)); break; } } } // Return return sentence; } /** * The pattern for "who did..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhoDid(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhoDid = TokenSequencePattern.compile( "[{lemma:who; tag:/W.*/}] " + "(?$do [ {lemma:/do/} ] ) " + "(?$statement_body []+?) " + "(?$now [tag:RB] )? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "who did ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhoDid */ private List<CoreLabel> processWhoDid(TokenSequenceMatcher matcher) { // Get the body List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$statement_body"); // Check if there is no main verb other than "do" // If it doesn't, then the sentence should be "person do ...." boolean hasVerb = false; for (CoreLabel w : sentence) { if (w.tag() != null && w.tag().startsWith("V")) { hasVerb = true; } } if (!hasVerb) { sentence.add(0, WORD_MISSING_PERSON); sentence.add(1, (CoreLabel) matcher.groupNodes("$do").get(0)); return sentence; } // Add the missing word // (in front of the PPs) boolean addedPerson = false; if (sentence.size() > 0 && sentence.get(sentence.size() - 1).tag() != null && !sentence.get(sentence.size() - 1).tag().startsWith("I")) { for (int i = 0; i < sentence.size() - 1; ++i) { if (sentence.get(i).tag() != null && (sentence.get(i).tag().equals("IN") || sentence.get(i).word().equals("last") || sentence.get(i).word().equals("next") || sentence.get(i).word().equals("this"))) { sentence.add(i, WORD_MISSING_PERSON); addedPerson = true; break; } } } // (at the end of the sentence) if (!addedPerson) { sentence.add(WORD_MISSING_PERSON); } // Add "now" / "first" / etc. List<CoreLabel> now = (List<CoreLabel>) matcher.groupNodes("$now"); if (now != null) { sentence.addAll(now); } // Return return sentence; } /** * The pattern for "where is..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhatDo(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhatDo = TokenSequencePattern.compile( "[{lemma:/what|which/; tag:/W.*/}] " + "(?$do [ {lemma:/do/} ]) " + "(?$pre_do [ !{lemma:do} & !{tag:IN} ]+) " + "(?$mid_do [ {lemma:do} ] )? " + "(?$in [ {tag:IN} ] )? " + "(?$post_do []+ )? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "what do ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhatDo */ private List<CoreLabel> processWhatDo(TokenSequenceMatcher matcher) { // Grab the prefix of the sentence List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$pre_do"); // Add the optional middle do List<CoreLabel> midDo = (List<CoreLabel>) matcher.groupNodes("$mid_do"); if (midDo != null) { sentence.addAll((List<CoreLabel>) matcher.groupNodes("$do")); } // Add the thing (not end of sentence) if (matcher.groupNodes("$post_do") != null) { sentence.add(WORD_MISSING); } // Add IN token List<CoreLabel> midIN = (List<CoreLabel>) matcher.groupNodes("$in"); if (midIN != null) { sentence.addAll(midIN); } // Add the thing (end of sentence) if (matcher.groupNodes("$post_do") == null) { if (sentence.size() > 1 && "off".equals(sentence.get(sentence.size() - 1).word())) { // Fix common typo sentence.get(sentence.size() - 1).setWord("of"); sentence.get(sentence.size() - 1).setValue("of"); sentence.get(sentence.size() - 1).setLemma("of"); sentence.get(sentence.size() - 1).setTag("IN"); } sentence.add(WORD_MISSING); } // Add post do List<CoreLabel> postDo = (List<CoreLabel>) matcher.groupNodes("$post_do"); if (postDo != null) { sentence.addAll(postDo); } // Tweak to handle dropped prepositions if (sentence.size() > 2 && !"IN".equals(sentence.get(sentence.size() - 2).tag()) && "CD".equals(sentence.get(sentence.size() - 1).tag())) { sentence.add(sentence.size() - 1, WORD_IN); } // Return return sentence; } /** * The pattern for "when do..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhenDo(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhenDo = TokenSequencePattern.compile( "[{lemma:when; tag:/W.*/}] " + "(?$do [ {lemma:/do/} ]) " + "(?$statement_body []+?) " + "(?$in [tag:/[IT].*/] )? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "when do ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhenDo */ private List<CoreLabel> processWhenDo(TokenSequenceMatcher matcher) { // Grab the prefix of the sentence List<CoreLabel> sentence = (List<CoreLabel>) matcher.groupNodes("$statement_body"); // Add the "at" part List<CoreLabel> in = (List<CoreLabel>) matcher.groupNodes("$in"); if (in != null && in.size() > 0) { sentence.addAll(in); } else { sentence.add(WORD_IN); } // Add the location sentence.add(WORD_MISSING_TIME); // Return return sentence; } /** * The pattern for "what have..." sentences. * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#processWhereIs(edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher) */ private final TokenSequencePattern triggerWhatHave = TokenSequencePattern.compile( "[{lemma:what; tag:/W.*/}] " + "(?$have [ {lemma:/have/} ]) " + "(?$pre_verb [!{tag:/V.*/}]+ )? " + "(?$verb [tag:/V.*/] [tag:IN]? ) " + "(?$post_verb []+ )? " + "(?$punct [word:/[?\\.!]/])" ); /** * * Process sentences matching the "when do ..." pattern. * * @param matcher The matcher that matched the pattern. * * @return The converted statement. * * @see edu.stanford.nlp.naturalli.QuestionToStatementTranslator#triggerWhenDo */ private List<CoreLabel> processWhatHave(TokenSequenceMatcher matcher) { List<CoreLabel> sentence = new ArrayList<>(); // Grab the prefix of the sentence List<CoreLabel> preVerb = (List<CoreLabel>) matcher.groupNodes("$pre_verb"); if (preVerb != null) { sentence.addAll(preVerb); } // Add "thing have verb" or "have verb thing" if (sentence.size() == 0) { sentence.add(WORD_MISSING); sentence.addAll( (List<CoreLabel>) matcher.groupNodes("$have") ); sentence.addAll( (List<CoreLabel>) matcher.groupNodes("$verb") ); } else { sentence.addAll( (List<CoreLabel>) matcher.groupNodes("$have") ); sentence.addAll( (List<CoreLabel>) matcher.groupNodes("$verb") ); sentence.add(WORD_MISSING); } List<CoreLabel> postVerb = (List<CoreLabel>) matcher.groupNodes("$post_verb"); if (postVerb != null) { sentence.addAll(postVerb); } return sentence; } /** * Convert a question to a statement, if possible. * <ul> * <li>The question must have words, lemmas, and part of speech tags.</li> * <li>The question must have valid punctuation.</li> * </ul> * * @param question The question to convert to a statement. * @return A list of statement translations of the question. This is usually a singleton list. */ public List<List<CoreLabel>> toStatement(List<CoreLabel> question) { TokenSequenceMatcher matcher; if ((matcher = triggerWhatIsThere.matcher(question)).matches()) { // must come before triggerWhatIs return Collections.singletonList(processWhatIsThere(matcher)); } else if ((matcher = triggerWhNNIs.matcher(question)).matches()) { // must come before triggerWhatIs return Collections.singletonList(processWhNNIs(matcher)); } else if ((matcher = triggerWhNNHave.matcher(question)).matches()) { // must come before triggerWhatHave return Collections.singletonList(processWhNNHave(matcher)); } else if ((matcher = triggerWhNNHaveNN.matcher(question)).matches()) { // must come before triggerWhatHave return Collections.singletonList(processWhNNHaveNN(matcher)); } else if ((matcher = triggerWhatIs.matcher(question)).matches()) { return Collections.singletonList(processWhatIs(matcher)); } else if ((matcher = triggerWhatHave.matcher(question)).matches()) { return Collections.singletonList(processWhatHave(matcher)); } else if ((matcher = triggerWhereDo.matcher(question)).matches()) { return Collections.singletonList(processWhereDo(matcher)); } else if ((matcher = triggerWhereIs.matcher(question)).matches()) { return Collections.singletonList(processWhereIs(matcher)); } else if ((matcher = triggerWhoIs.matcher(question)).matches()) { return Collections.singletonList(processWhoIs(matcher)); } else if ((matcher = triggerWhoDid.matcher(question)).matches()) { return Collections.singletonList(processWhoDid(matcher)); } else if ((matcher = triggerWhatDo.matcher(question)).matches()) { return Collections.singletonList(processWhatDo(matcher)); } else if ((matcher = triggerWhenDo.matcher(question)).matches()) { return Collections.singletonList(processWhenDo(matcher)); } else { return Collections.emptyList(); } } }