package edu.stanford.nlp.international.spanish.pipeline;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeFactory;
import edu.stanford.nlp.trees.TreeNormalizer;
import edu.stanford.nlp.trees.international.spanish.SpanishTreeNormalizer;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import edu.stanford.nlp.util.Pair;
/**
* Provides routines for "decompressing" further the expanded trees
* formed by multiword token splitting.
*
* Multiword token expansion leaves constituent words as siblings in a
* "flat" tree structure. This often represents an incorrect parse of
* the sentence. For example, the phrase "Ministerio de Finanzas" should
* not be parsed as a flat structure like
*
* (grup.nom (np00000 Ministerio) (sp000 de) (np00000 Finanzas))
*
* but rather a "deep" structure like
*
* (grup.nom (sp (prep (sp000 de))
* (sn (grup.nom (np0000 Finanzas)))))
*
* This class provides methods for detecting common linguistic patterns
* that should be expanded in this way.
*/
public class MultiWordTreeExpander {
/**
* Regular expression to match groups inside which we want to expand things
*/
private static final String CANDIDATE_GROUPS = "(^grup\\.(adv|c[cs]|[iwz]|nom|prep|pron|verb)|\\.inter)";
private static final String PREPOSITIONS =
"(por|para|pro|al?|del?|con(?:tra)?|sobre|en(?:tre)?|hacia|sin|según|hasta|bajo)";
private final TregexPattern parentheticalExpression = TregexPattern.compile(
"fpa=left > /^grup\\.nom$/ " + "$++ fpt=right");
private final TsurgeonPattern groupParentheticalExpression
= Tsurgeon.parseOperation("createSubtree grup.nom.inter4 left right");
/**
* Yes, some multiword tokens contain multiple clauses..
*/
private final TregexPattern multipleClauses
= TregexPattern.compile(
// Nested nominal group containing period punctuation
"/^grup\\.nom/ > /^grup\\.nom/ < (fp !$-- fp $- /^[^g]/=right1 $+ __=left2)" +
// Match boundaries for subtrees created
" <, __=left1 <` __=right2");
private final TsurgeonPattern expandMultipleClauses
= Tsurgeon.parseOperation("[createSubtree grup.nom left1 right1]" +
"[createSubtree grup.nom left2 right2]");
private final TregexPattern prepositionalPhrase
= TregexPattern.compile(// Match candidate preposition
"sp000=tag < /(?i)^" + PREPOSITIONS + "$/" +
// Headed by a group that was generated from
// multi-word token expansion and that we
// wish to expand further
" > (/" + CANDIDATE_GROUPS + "/ <- __=right)" +
// With an NP on the left (-> this is a
// prep. phrase) and not preceded by any
// other prepositions
" $+ /^([adnswz]|p[ipr])/=left !$-- sp000");
private final TregexPattern leadingPrepositionalPhrase
= TregexPattern.compile(// Match candidate preposition
"sp000=tag < /(?i)^" + PREPOSITIONS + "$/" +
// Which is the first child in a group that
// was generated from multi-word token
// expansion and that we wish to expand
// further
" >, (/" + CANDIDATE_GROUPS + "/ <- __=right)" +
// With an NP on the left (-> this is a
// prep. phrase) and not preceded by any
// other prepositions
" $+ /^([adnswz]|p[ipr])/=left !$-- sp000");
/**
* First step in expanding prepositional phrases: group NP to right of
* preposition under a `grup.nom` subtree (specially labeled for now
* so that we can target it in the next step)
*/
private final TsurgeonPattern expandPrepositionalPhrase1 =
Tsurgeon.parseOperation("[createSubtree grup.nom.inter left right]");
/**
* Matches intermediate prepositional phrase structures as produced by
* the first step of expansion.
*/
private final TregexPattern intermediatePrepositionalPhrase
= TregexPattern.compile("sp000=preptag $+ /^grup\\.nom\\.inter$/=gn");
/**
* Second step: replace intermediate prepositional phrase structure
* with final result.
*/
private final TsurgeonPattern expandPrepositionalPhrase2 =
Tsurgeon.parseOperation("[adjoinF (sp (prep T=preptarget) (sn foot@)) gn]" +
"[relabel gn /.inter$//]" +
"[replace preptarget preptag]" +
"[delete preptag]");
private final TregexPattern prepositionalVP =
TregexPattern.compile("sp000=tag < /(?i)^(para|al?|del?)$/" +
" > (/" + CANDIDATE_GROUPS + "/ <- __=right)" +
" $+ vmn0000=left !$-- sp000");
private final TsurgeonPattern expandPrepositionalVP1 =
Tsurgeon.parseOperation("[createSubtree S.inter left right]" +
"[adjoinF (infinitiu foot@) left]");
private final TregexPattern intermediatePrepositionalVP =
TregexPattern.compile("sp000=preptag $+ /^S\\.inter$/=si");
private final TsurgeonPattern expandPrepositionalVP2 =
Tsurgeon.parseOperation("[adjoin (sp prep=target S@) si] [move preptag >0 target]");
private final TregexPattern conjunctPhrase =
TregexPattern.compile("cc=cc" +
// In one of our expanded phrases (match
// bounds of this expanded phrase; these form
// the left edge of first new subtree and the
// right edge of the second new subtree)
" > (/^grup\\.nom/ <, __=left1 <` __=right2)" +
// Fetch more bounds: node to immediate left
// of cc is the right edge of the first new
// subtree, and node to right of cc is the
// left edge of the second new subtree
//
// NB: left1 may the same as right1; likewise
// for the second tree
" $- /^[^g]/=right1 $+ /^[^g]/=left2");
private final TsurgeonPattern expandConjunctPhrase =
Tsurgeon.parseOperation("[adjoinF (conj foot@) cc]" +
"[createSubtree grup.nom.inter2 left1 right1]" +
"[createSubtree grup.nom.inter2 left2 right2]");
/**
* Simple intermediate conjunct: a constituent which heads a single
* substantive
*/
private final TregexPattern intermediateSubstantiveConjunct =
TregexPattern.compile("/grup\\.nom\\.inter2/=target <: /^[dnpw]/");
/**
* Rename simple intermediate conjunct as a `grup.nom`
*/
private final TsurgeonPattern expandIntermediateSubstantiveConjunct =
Tsurgeon.parseOperation("[relabel target /grup.nom/]");
/**
* Simple intermediate conjunct: a constituent which heads a single
* adjective
*/
private final TregexPattern intermediateAdjectiveConjunct =
TregexPattern.compile("/^grup\\.nom\\.inter2$/=target <: /^a/");
/**
* Rename simple intermediate adjective conjunct as a `grup.a`
*/
private final TsurgeonPattern expandIntermediateAdjectiveConjunct =
Tsurgeon.parseOperation("[relabel target /grup.a/]");
/**
* Match parts of an expanded conjunct which must be labeled as a noun
* phrase given their children.
*/
private final TregexPattern intermediateNounPhraseConjunct =
TregexPattern.compile("/^grup\\.nom\\.inter2$/=target < /^s[pn]$/");
private final TsurgeonPattern expandIntermediateNounPhraseConjunct =
Tsurgeon.parseOperation("[relabel target sn]");
/**
* Intermediate conjunct: verb
*/
private final TregexPattern intermediateVerbConjunct =
TregexPattern.compile("/^grup\\.nom\\.inter2$/=gn <: /^vmi/");
private final TsurgeonPattern expandIntermediateVerbConjunct =
Tsurgeon.parseOperation("[adjoin (S (grup.verb@)) gn]");
/**
* Match parts of an expanded conjunct which should be labeled as
* nominal groups.
*/
private final TregexPattern intermediateNominalGroupConjunct =
TregexPattern.compile("/^grup\\.nom\\.inter2$/=target !< /^[^n]/");
private final TsurgeonPattern expandIntermediateNominalGroupConjunct =
Tsurgeon.parseOperation("[relabel target /grup.nom/]");
/**
* Match articles contained within nominal groups of substantives so
* that they can be moved out
*/
private final TregexPattern articleLeadingNominalGroup =
TregexPattern.compile("/^d[aip]/=art >, (/^grup\\.nom$/=ng > sn)");
private final TsurgeonPattern expandArticleLeadingNominalGroup =
Tsurgeon.parseOperation("[insert (spec=target) $+ ng] [move art >0 target]");
private final TregexPattern articleInsideOrphanedNominalGroup =
TregexPattern.compile("/^d[aip]/=d >, (/^grup\\.nom/=ng !> sn)");
private final TsurgeonPattern expandArticleInsideOrphanedNominalGroup =
Tsurgeon.parseOperation("[adjoinF (sn=sn spec=spec foot@) ng] [move d >0 spec]");
private final TregexPattern determinerInsideNominalGroup =
TregexPattern.compile("/^d[^n]/=det >, (/^grup\\.nom/=ng > sn) $ __");
private final TsurgeonPattern expandDeterminerInsideNominalGroup =
Tsurgeon.parseOperation("[insert (spec=target) $+ ng] [move det >0 target]");
// "en opinion del X," "además del Y"
private final TregexPattern contractionTrailingIdiomBeforeNominalGroup
= TregexPattern.compile("sp000 >` (/^grup\\.prep$/ > (__ $+ /^grup\\.nom/=ng)) < /^(de|a)l$/=contraction");
// -> "(en opinion de) (el X)," "(además de) (el Y)"
private final TsurgeonPattern joinArticleWithNominalGroup
= Tsurgeon.parseOperation("[relabel contraction /l//] [adjoinF (sn (spec (da0000 el)) foot@) ng]");
private final TregexPattern contractionInSpecifier
= TregexPattern.compile("sp000=parent < /(?i)^(a|de)l$/=contraction > spec");
private final TregexPattern delTodo = TregexPattern.compile("del=contraction . todo > sp000=parent");
// "del X al Y"
private final TregexPattern contractionInRangePhrase
= TregexPattern.compile("sp000 < /(?i)^(a|de)l$/=contraction >: (conj $+ (/^grup\\.(w|nom)/=group))");
private final TsurgeonPattern expandContractionInRangePhrase
= Tsurgeon.parseOperation("[relabel contraction /(?i)l//] [adjoinF (sn (spec (da0000 el)) foot@) group]");
/**
* Operation to extract article from contraction and just put it next to the container
*/
private final TsurgeonPattern extendContraction
= Tsurgeon.parseOperation("[relabel contraction /l//] [insert (da0000 el) $- parent]");
// ---------
// Final cleanup operations
private final TregexPattern terminalPrepositions
= TregexPattern.compile("sp000=sp < /" + PREPOSITIONS + "/ >- (/^grup\\.nom/ >+(/^grup\\.nom/) sn=sn >>- =sn)");
private final TsurgeonPattern extractTerminalPrepositions = Tsurgeon.parseOperation(
"[insert (prep=prep) $- sn] [move sp >0 prep]");
/**
* Match terminal prepositions in prepositional phrases: "a lo largo de"
*/
private final TregexPattern terminalPrepositions2
= TregexPattern.compile("prep=prep >` (/^grup\\.nom$/ >: (sn=sn > /^(grup\\.prep|sp)$/))");
private final TsurgeonPattern extractTerminalPrepositions2
= Tsurgeon.parseOperation("move prep $- sn");
/**
* Match terminal prepositions in infinitive clause within prepositional phrase: "a partir de," etc.
*/
private final TregexPattern terminalPrepositions3
= TregexPattern.compile("sp000=sp $- infinitiu >` (S=S >` /^(grup\\.prep|sp)$/)");
private final TsurgeonPattern extractTerminalPrepositions3
= Tsurgeon.parseOperation("[insert (prep=prep) $- S] [move sp >0 prep]");
private final TregexPattern adverbNominalGroups = TregexPattern.compile("/^grup\\.nom./=ng <: /^r[gn]/=r");
private final TsurgeonPattern replaceAdverbNominalGroup = Tsurgeon.parseOperation("replace ng r");
/**
* Match blocks of only adjectives (one or more) with a nominal group parent. These constituents should be rewritten
* beneath an adjectival group constituent.
*/
private final TregexPattern adjectiveSpanInNominalGroup
= TregexPattern.compile("/^grup\\.nom/=ng <, aq0000=left <` aq0000=right !< /^[^a]/");
/**
* Match dependent clauses mistakenly held under nominal groups ("lo que X")
*/
private final TregexPattern clauseInNominalGroup
= TregexPattern.compile("lo . (que > (pr000000=pr >, /^grup\\.nom/=ng $+ (/^v/=vb >` =ng)))");
private final TsurgeonPattern labelClause
= Tsurgeon.parseOperation("[relabel ng S] [adjoinF (relatiu foot@) pr] [adjoinF (grup.verb foot@) vb]");
/**
* Infinitive clause mistakenly held under nominal group
*/
private final TregexPattern clauseInNominalGroup2 = TregexPattern.compile("/^grup\\.nom/=gn $- spec <: /^vmn/");
private final TsurgeonPattern labelClause2 = Tsurgeon.parseOperation("[adjoin (S (infinitiu@)) gn]");
private final TregexPattern clauseInNominalGroup3 = TregexPattern.compile("sn=sn <, (/^vmn/=inf $+ (sp >` =sn))");
private final TsurgeonPattern labelClause3
= Tsurgeon.parseOperation("[relabel sn S] [adjoinF (infinitiu foot@) inf]");
private final TregexPattern loneAdjectiveInNominalGroup
= TregexPattern.compile("/^a/=a > /^grup\\.nom/ $ /^([snwz]|p[ipr])/ !$ /^a/");
private final TsurgeonPattern labelAdjective = Tsurgeon.parseOperation("[adjoinF (s.a (grup.a foot@)) a]");
private final TsurgeonPattern groupAdjectives = Tsurgeon.parseOperation("createSubtree (s.a grup.a@) left right");
/**
* Some brute-force fixes:
*/
private final TregexPattern alMenos
= TregexPattern.compile("/(?i)^al$/ . /(?i)^menos$/ > (sp000 $+ rg > /^grup\\.adv$/=ga)");
private final TsurgeonPattern fixAlMenos
= Tsurgeon.parseOperation("replace ga (grup.adv (sp (prep (sp000 a)) (sn (spec (da0000 lo)) (grup.nom (s.a (grup.a (aq0000 menos)))))))");
private final TregexPattern todoLoContrario
= TregexPattern.compile("(__=ttodo < /(?i)^todo$/) $+ (__=tlo < /(?i)^lo$/ $+ (__=tcon < /(?i)^contrario$/))");
private final TsurgeonPattern fixTodoLoContrario
= Tsurgeon.parseOperation("[adjoin (sn (grup.nom (pp000000@))) tlo] [adjoin (grup.a (aq0000@)) tcon]");
/**
* Mark infinitives within verb groups ("hacer ver", etc.)
*/
private final TregexPattern infinitiveInVerbGroup
= TregexPattern.compile("/^grup\\.verb$/=grup < (/^v/ !$-- /^v/ $++ (/^vmn/=target !$++ /^vmn/))");
private final TsurgeonPattern markInfinitive = Tsurgeon.parseOperation("[adjoinF (infinitiu foot@) target]");
/**
* The corpus marks entire multiword verb tokens like "teniendo en
* cuenta" as gerunds / infinitives (by heading them with a
* constituent "gerundi" / "infinitiu"). Now that we've split into
* separate words, transfer this gerund designation so that it heads
* the verb only.
*/
private final TregexPattern floppedGerund
= TregexPattern.compile("/^grup\\.verb$/=grup >: gerundi=ger < (/^vmg/=vb !$ /^vmg/)");
private final TsurgeonPattern unflopFloppedGerund
= Tsurgeon.parseOperation("[adjoinF (gerundi foot@) vb] [replace ger grup]");
private final TregexPattern floppedInfinitive
= TregexPattern.compile("/^grup\\.verb$/=grup >: infinitiu=inf < (/^vmn/=vb !$ /^vmn/)");
private final TsurgeonPattern unflopFloppedInfinitive
= Tsurgeon.parseOperation("[adjoinF (infinitiu foot@) vb] [replace inf grup]");
/**
* Match `sn` constituents which can (should) be rewritten as nominal groups
*/
private final TregexPattern nominalGroupSubstantives =
TregexPattern.compile("sn=target < /^[adnwz]/ !< /^([^adnswz]|neg)/");
private final TregexPattern leftoverIntermediates =
TregexPattern.compile("/^grup\\.nom\\.inter/=target");
private final TsurgeonPattern makeNominalGroup =
Tsurgeon.parseOperation("[relabel target /grup.nom/]");
private final TregexPattern redundantNominalRewrite =
TregexPattern.compile("/^grup\\.nom$/ <: sn=child >: sn=parent");
private final TsurgeonPattern fixRedundantNominalRewrite =
Tsurgeon.parseOperation("[replace parent child]");
private final TregexPattern redundantPrepositionGroupRewrite =
TregexPattern.compile("/^grup\\.prep$/=parent <: sp=child >: prep");
private final TsurgeonPattern fixRedundantPrepositionGroupRewrite =
Tsurgeon.parseOperation("[relabel child /grup.prep/] [replace parent child]");
private final TregexPattern redundantPrepositionGroupRewrite2 = TregexPattern.compile("/^grup\\.prep$/=gp <: sp=sp");
private final TsurgeonPattern fixRedundantPrepositionGroupRewrite2 = Tsurgeon.parseOperation("replace gp sp");
/**
* Patterns in this list turn flat structures into intermediate forms
* which will eventually become deep phrase structures.
*/
private final List<Pair<TregexPattern, TsurgeonPattern>> firstStepExpansions = Arrays.asList(
// Should be first-ish
new Pair<>(parentheticalExpression, groupParentheticalExpression),
new Pair<>(multipleClauses, expandMultipleClauses),
new Pair<>(leadingPrepositionalPhrase,
expandPrepositionalPhrase1),
new Pair<>(conjunctPhrase, expandConjunctPhrase),
new Pair<>(prepositionalPhrase, expandPrepositionalPhrase1),
new Pair<>(prepositionalVP, expandPrepositionalVP1),
new Pair<>(contractionTrailingIdiomBeforeNominalGroup,
joinArticleWithNominalGroup),
new Pair<>(contractionInSpecifier, extendContraction),
new Pair<>(delTodo, extendContraction),
new Pair<>(contractionInRangePhrase,
expandContractionInRangePhrase),
// Should not happen until the last moment! The function words
// being targeted have weaker "scope" than others earlier
// targeted, and so we don't want to clump things around them
// until we know we have the right to clump
new Pair<>(articleLeadingNominalGroup,
expandArticleLeadingNominalGroup),
new Pair<>(articleInsideOrphanedNominalGroup,
expandArticleInsideOrphanedNominalGroup),
new Pair<>(determinerInsideNominalGroup,
expandDeterminerInsideNominalGroup)
);
/**
* Patterns in this list clean up "intermediate" phrase structures
* produced by previous step and produce something from them that
* looks like the rest of the corpus.
*/
private final List<Pair<TregexPattern, TsurgeonPattern>> intermediateExpansions = Arrays.asList(
new Pair<>(intermediatePrepositionalPhrase,
expandPrepositionalPhrase2),
new Pair<>(intermediatePrepositionalVP, expandPrepositionalVP2),
new Pair<>(intermediateSubstantiveConjunct,
expandIntermediateSubstantiveConjunct),
new Pair<>(intermediateAdjectiveConjunct,
expandIntermediateAdjectiveConjunct),
new Pair<>(intermediateNounPhraseConjunct,
expandIntermediateNounPhraseConjunct),
new Pair<>(intermediateVerbConjunct,
expandIntermediateVerbConjunct),
new Pair<>(intermediateNominalGroupConjunct,
expandIntermediateNominalGroupConjunct)
);
/**
* Patterns in this list perform last-minute cleanup of leftover
* grammar mistakes which this class created.
*/
private final List<Pair<TregexPattern, TsurgeonPattern>> finalCleanup = Arrays.asList(
new Pair<>(terminalPrepositions, extractTerminalPrepositions),
new Pair<>(terminalPrepositions2, extractTerminalPrepositions2),
new Pair<>(terminalPrepositions3, extractTerminalPrepositions3),
new Pair<>(nominalGroupSubstantives, makeNominalGroup),
new Pair<>(adverbNominalGroups, replaceAdverbNominalGroup),
new Pair<>(adjectiveSpanInNominalGroup, groupAdjectives),
new Pair<>(clauseInNominalGroup, labelClause),
new Pair<>(clauseInNominalGroup2, labelClause2),
new Pair<>(clauseInNominalGroup3, labelClause3),
new Pair<>(loneAdjectiveInNominalGroup, labelAdjective),
// Verb phrase-related cleanup.. order is important!
new Pair<>(infinitiveInVerbGroup, markInfinitive),
new Pair<>(floppedGerund, unflopFloppedGerund),
new Pair<>(floppedInfinitive, unflopFloppedInfinitive),
// Fixes for specific common phrases
new Pair<>(alMenos, fixAlMenos),
new Pair<>(todoLoContrario, fixTodoLoContrario),
// Lastly..
//
// These final fixes are not at all linguistically motivated -- just need to make the trees less dirty
new Pair<>(redundantNominalRewrite, fixRedundantNominalRewrite),
new Pair<>(redundantPrepositionGroupRewrite,
fixRedundantPrepositionGroupRewrite),
new Pair<>(redundantPrepositionGroupRewrite2,
fixRedundantPrepositionGroupRewrite2),
new Pair<>(leftoverIntermediates, makeNominalGroup)
);
/**
* Recognize candidate patterns for expansion in the given tree and
* perform the expansions. See the class documentation for more
* information.
*/
public Tree expandPhrases(Tree t, TreeNormalizer tn, TreeFactory tf) {
// Keep running this sequence of patterns until no changes are
// affected. We need this for nested expressions like "para tratar
// de regresar al empleo." This first step produces lots of
// "intermediate" tree structures which need to be cleaned up later.
Tree oldTree;
do {
oldTree = t.deepCopy();
t = Tsurgeon.processPatternsOnTree(firstStepExpansions, t);
} while (!t.equals(oldTree));
// Now clean up intermediate tree structures
t = Tsurgeon.processPatternsOnTree(intermediateExpansions, t);
// Normalize first to allow for contraction expansion, etc.
t = tn.normalizeWholeTree(t, tf);
// Final cleanup
t = Tsurgeon.processPatternsOnTree(finalCleanup, t);
return t;
}
}
// GOOD EXAMPLES
// incidentes . lamentables (nested articles near middle)
// chiquilla . vistosa (giant multiword at end)
// espejo . deformante (article fun at start)
// menor . coste (watch "Comisión del Mercado" thing at end)
// totalmente . evitables ("en opinion del" at end)
// TODO (corpus)
// epígrafe . Arte (flat!)
// TODO (parser)
// debería .. encima ("por encima de" parse, coordinated NP)
// manía .. catalán ("castellana" parsed as being under a participi constituent)