package edu.stanford.nlp.trees.international.spanish;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.stanford.nlp.international.spanish.process.AnCoraPronounDisambiguator;
import edu.stanford.nlp.international.spanish.SpanishVerbStripper;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasTag;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.trees.tregex.tsurgeon.Tsurgeon;
import edu.stanford.nlp.trees.tregex.tsurgeon.TsurgeonPattern;
import java.util.function.Predicate;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;

/**
 * Normalize trees read from the AnCora Spanish corpus.
 *
 * The normalization pipeline (see {@link #normalizeWholeTree}) prunes
 * empty leaves, applies a fixed list of Tregex / Tsurgeon cleanup
 * rules, optionally simplifies part-of-speech tags and expands
 * multi-word tokens, and finally expands elisions ("al", "del"),
 * "conmigo"-style forms and clitic pronouns attached to verbs.
 */
public class SpanishTreeNormalizer extends BobChrisTreeNormalizer {

  /**
   * Tag provided to words which are extracted from a multi-word token
   * into their own independent nodes
   */
  public static final String MW_TAG = "MW?";

  /**
   * Tag provided to constituents which contain words from MW tokens
   */
  public static final String MW_PHRASE_TAG = "MW_PHRASE?";

  public static final String EMPTY_LEAF_VALUE = "=NONE=";
  public static final String LEFT_PARENTHESIS = "=LRB=";
  public static final String RIGHT_PARENTHESIS = "=RRB=";

  /**
   * Terminal replacements applied by {@link #normalizeTerminal}. The
   * trailing comments name the corpus location of each misspelling.
   */
  private static final Map<String, String> spellingFixes = new HashMap<>();
  static {
    spellingFixes.put("embargp", "embargo"); // 18381_20000322.tbf-4
    spellingFixes.put("jucio", "juicio"); // 4800_2000406.tbf-5
    spellingFixes.put("méxico", "México"); // 111_C-3.tbf-17
    spellingFixes.put("reirse", "reírse"); // 140_20011102.tbf-13
    spellingFixes.put("tambien", "también"); // 41_19991002.tbf-8
    spellingFixes.put("Intitute", "Institute"); // 22863_20001129.tbf-16

    // Hack: these aren't exactly spelling mistakes, but we need to
    // run a search-and-replace across the entire corpus with them, so
    // they should be treated just like spelling mistakes for our
    // purposes
    spellingFixes.put("(", LEFT_PARENTHESIS);
    spellingFixes.put(")", RIGHT_PARENTHESIS);
  }

  /**
   * A filter which rejects preterminal nodes that contain "empty" leaf
   * nodes.
   *
   * The comparison is written constant-first so that a leaf with a
   * {@code null} value is simply accepted rather than causing a
   * {@code NullPointerException}.
   */
  private static final Predicate<Tree> emptyFilter =
    tree -> !(tree.isPreTerminal()
              && EMPTY_LEAF_VALUE.equals(tree.firstChild().value()));

  /**
   * Resolves some inconsistencies in constituent naming:
   *
   * - "sa" and "s.a" are equivalent -- merge to "s.a"
   */
  private static final TreeTransformer constituentRenamer = new TreeTransformer() {
    @Override
    public Tree transformTree(Tree t) {
      if (t.isLeaf()) {
        return t;
      }

      String value = t.value();
      if (value == null) {
        return t;
      }

      if (value.equals("sa")) {
        t.setValue("s.a");
      }
      return t;
    }
  };

  /**
   * Tregex pattern / Tsurgeon operation pairs (as source strings) which
   * are compiled into {@link #cleanup} and run on every tree.
   *
   * (Generic arrays cannot be created directly, hence the raw
   * {@code Pair[]} and the unchecked suppression.)
   */
  @SuppressWarnings("unchecked")
  private static final Pair<String, String>[] cleanupStrs = new Pair[] {
    new Pair("sp < (sp=sp <: prep=prep)", "replace sp prep"),

    // Left and right parentheses should be at same depth
    new Pair("fpa > __=grandparent $++ (__=ancestor <<` fpt=fpt >` =grandparent)",
             "move fpt $- ancestor"),

    // Nominal groups where adjectival groups belong
    new Pair("/^s\\.a$/ <: (/^grup\\.nom$/=gn <: /^a/)", "relabel gn /grup.a/"),

    // Adverbial phrases should always have adverb group children
    // -- we see about 50 exceptions in the corpus..
    new Pair("sadv !< /^grup\\.adv$/ <: /^(rg|neg)$/=adv", "adjoinF (grup.adv foot@) adv"),

    // 'z' tag should be 'z0'
    new Pair("z=z <: (__ !< __)", "relabel z z0"),

    // Conjunction groups aren't necessary if they head single
    // prepositional phrases (we already see a `conj < sp` pattern;
    // replicate that
    new Pair("/^grup\\.c/=grup > conj <: sp=sp", "replace grup sp"),

    // "Lift up" sentence-final periods which have been nested within
    // constituents (convention in AnCora is to have sentence-final
    // periods as final right children of the `sentence` constituent)
    new Pair("__=N <<` (fp|fs=fp <: (/^\\.$/ !. __)) > sentence=sentence", "move fp $- N"),

    // AnCora has a few weird parses of "nada que ver" and related
    // phrases. Normalize them:
    //
    //     (grup.nom (pi000000 X) (S (relatiu (pr000000 que))
    //                               (infinitiu (vmn0000 Y))))
    new Pair("(pi000000 <: __ !$+ S >` (/^grup\\.nom/=gn >` sn=sn))" +
             ". ((que >: (__=queTag $- =sn)) . (__=vb !< __ >>: (__=vbContainer $- =queTag)))",
             "[insert (S (relatiu (pr000000 que)) (infinitiu vmn0000=vbFoot)) >-1 gn]" +
             "[move vb >0 vbFoot]" +
             "[delete queTag]" +
             "[delete vbContainer]"),

    // One more bizarre "nada que ver"
    new Pair("sn=sn <: (/^grup\\.nom/=gn <<: Nada)" +
             "$+ (infinitiu=inf <<, que=que <<` (ver , =que) $+ sp=sp)",
             "[delete inf] [insert (S (relatiu (pr000000 que)) (infinitiu (vmn0000 ver))) >-1 gn]" +
             "[move sp >-1 sn]"),

    // Remove date lead-ins
    new Pair("sentence <<, (sn=sn <, (/^grup\\.w$/ $+ fp))", "delete sn"),

    // Shed "conj" parents of periods in the middle of trees so that
    // our splitter can identify sentence boundaries properly
    new Pair("conj=conj <: fp=fp", "replace conj fp"),

    // Fix mis-tagging of inverted question mark
    new Pair("fit=fit <: ¿", "relabel fit fia"),
  };

  private static final List<Pair<TregexPattern, TsurgeonPattern>> cleanup
    = compilePatterns(cleanupStrs);

  /**
   * If one of the constituents in this set has a single child which is
   * a multi-word token, it should be replaced by a node heading the
   * expanded word leaves rather than simply receive that node as a
   * child.
   *
   * Note that this is only the case for constituents with a *single*
   * child which is a multi-word token.
   */
  private static final Set<String> mergeWithConstituentWhenPossible = new HashSet<>(
    Arrays.asList(
      "grup.adv",
      "grup.nom",
      "grup.nom.loc",
      "grup.nom.org",
      "grup.nom.otros",
      "grup.nom.pers",
      "grup.verb",
      "spec"
    ));

  // Customization
  private final boolean simplifiedTagset;
  private final boolean aggressiveNormalization;
  private final boolean retainNER;

  /**
   * Build a normalizer with the simplified tagset enabled and both
   * aggressive normalization and NER retention disabled.
   */
  public SpanishTreeNormalizer() {
    this(true, false, false);
  }

  /**
   * @param simplifiedTagset if true, preterminal labels are rewritten
   *          with {@link #simplifyPOSTag}
   * @param aggressiveNormalization if true, multi-word tokens are
   *          expanded into separate leaves
   * @param retainNER if true, simple named entities are marked in the
   *          tree and the NER character of noun tags is kept
   * @throws IllegalArgumentException if {@code retainNER} is requested
   *           without {@code simplifiedTagset}
   */
  public SpanishTreeNormalizer(boolean simplifiedTagset,
                               boolean aggressiveNormalization,
                               boolean retainNER) {
    super(new SpanishTreebankLanguagePack());

    if (retainNER && !simplifiedTagset) {
      throw new IllegalArgumentException("retainNER argument only valid when " +
                                         "simplified tagset is used");
    }

    this.simplifiedTagset = simplifiedTagset;
    this.aggressiveNormalization = aggressiveNormalization;
    this.retainNER = retainNER;
  }

  /**
   * Run the full normalization pipeline on the given tree.
   *
   * @param tree tree to normalize
   * @param tf factory used to build any new nodes (multi-word
   *          expansions and the root wrapper)
   * @return the normalized tree, wrapped in a node labeled with the
   *         language pack's start symbol if its root did not already
   *         carry that label
   */
  @Override
  public Tree normalizeWholeTree(Tree tree, TreeFactory tf) {
    // Begin with some basic transformations
    tree = tree.prune(emptyFilter).spliceOut(aOverAFilter)
      .transform(constituentRenamer);

    // Now start some simple cleanup
    tree = Tsurgeon.processPatternsOnTree(cleanup, tree);

    // That might've produced some more A-over-As
    tree = tree.spliceOut(aOverAFilter);

    // Find all named entities which are not multi-word tokens and nest
    // them within named entity NP groups
    if (retainNER) {
      markSimpleNamedEntities(tree);
    }

    for (Tree t : tree) {
      if (simplifiedTagset && t.isPreTerminal()) {
        // This is a part of speech tag. Remove extra morphological
        // information.
        CoreLabel label = (CoreLabel) t.label();
        String pos = label.value();

        pos = simplifyPOSTag(pos).intern();
        label.setValue(pos);
        label.setTag(pos);
      } else if (aggressiveNormalization && isMultiWordCandidate(t)) {
        // Expand multi-word token if necessary
        normalizeForMultiWord(t, tf);
      }
    }

    // More tregex-powered fixes
    tree = expandElisions(tree);
    tree = expandConmigo(tree);
    tree = expandCliticPronouns(tree);

    // Make sure the tree has a top-level unary rewrite; the root
    // should have a proper root label
    String rootLabel = tlp.startSymbol();
    if (!tree.value().equals(rootLabel)) {
      tree = tf.newTreeNode(rootLabel, Collections.singletonList(tree));
    }

    return tree;
  }

  /**
   * Replace a terminal with its entry in the spelling-fix table, or
   * return it unchanged if the table has no entry for it.
   */
  @Override
  public String normalizeTerminal(String word) {
    return spellingFixes.getOrDefault(word, word);
  }

  /**
   * Return a "simplified" version of an original AnCora part-of-speech
   * tag, with much morphological annotation information removed.
   *
   * The empty string is returned unchanged. For the categories handled
   * explicitly below the tag must be long enough for the prefix that
   * is retained (2 to 4 characters depending on the category);
   * a shorter tag causes a {@code StringIndexOutOfBoundsException}.
   */
  private String simplifyPOSTag(String pos) {
    if (pos.length() == 0) {
      return pos;
    }

    switch (pos.charAt(0)) {
    case 'd':
      // determinant (d)
      //   retain category, type
      //   drop person, gender, number, possessor
      return pos.substring(0, 2) + "0000";
    case 's':
      // preposition (s)
      //   retain category, type
      //   drop form, gender, number
      return pos.substring(0, 2) + "000";
    case 'p':
      // pronoun (p)
      //   retain category, type
      //   drop person, gender, number, case, possessor, politeness
      return pos.substring(0, 2) + "000000";
    case 'a':
      // adjective
      //   retain category, type, grade
      //   drop gender, number, function
      return pos.substring(0, 3) + "000";
    case 'n':
      // noun
      //   retain category, type, number, NER label
      //   drop gender, classification
      //
      // The NER label is only kept when NER retention is enabled and
      // the tag has exactly seven characters.
      char ner = retainNER && pos.length() == 7 ? pos.charAt(6) : '0';
      return pos.substring(0, 2) + '0' + pos.charAt(3) + "00" + ner;
    case 'v':
      // verb
      //   retain category, type, mood, tense
      //   drop person, number, gender
      return pos.substring(0, 4) + "000";
    default:
      // adverb
      //   retain all
      // punctuation
      //   retain all
      // numerals
      //   retain all
      // date and time
      //   retain all
      // conjunction
      //   retain all
      return pos;
    }
  }

  /**
   * Matches a verb with attached pronouns; used in several following
   * Tregex expressions
   */
  private static final String VERB_LEAF_WITH_PRONOUNS_TREGEX =
    // Match a leaf that looks like it has a clitic pronoun
    // Match suffixes of regular forms which may carry attached
    // pronouns (imperative, gerund, infinitive)
    "/(?:(?:[aeiáéí]r|[áé]ndo|[aeáé]n?|[aeiáéí](?:d(?!os)|(?=os)))" +
    // Match irregular imperative stems
    "|^(?:d[ií]|h[aá]z|v[eé]|p[oó]n|s[aá]l|sé|t[eé]n|v[eé]n|(?:id(?=os$))))" +
    // Match attached pronouns
    "(?:(?:(?:[mts]e|n?os|les?)(?:l[oa]s?)?)|l[oa]s?)$/=vb " +
    // It should actually be a verb (gerund, imperative or
    // infinitive)
    //
    // (Careful: other code that uses this pattern requires that this
    // node be at the end, with parens so that it can be named /
    // modified. See e.g. #verbWithCliticPronounAndSiblings)
    "> (/^vm[gmn]0000$/";

  /**
   * Matches verbs (infinitives, gerunds and imperatives) which have
   * attached pronouns, and the clauses which contain them
   */
  private static final TregexPattern verbWithCliticPronouns =
    TregexPattern.compile(VERB_LEAF_WITH_PRONOUNS_TREGEX +
                          // Verb tag should not have siblings in verb
                          // phrase
                          " !$ __)" +

                          // Locate the clause which contains it, and
                          // the child just below that clause
                          ">+(/^[^S]/) (/^(infinitiu|gerundi|grup\\.verb)$/=target " +
                          "> /^(sentence|S|grup\\.verb|infinitiu|gerundi)$/=clause << =vb " +

                          // Make sure we're not up too far in the tree:
                          // there should be no infinitive / gerund /
                          // verb phrase between the located ancestor
                          // and the verb
                          "!<< (/^(infinitiu|gerundi|grup\\.verb)$/ << =vb))");

  /**
   * Matches verbs (infinitives, gerunds and imperatives) which have
   * attached pronouns and siblings within their containing verb
   * phrases
   */
  private static final TregexPattern verbWithCliticPronounsAndSiblings =
    TregexPattern.compile(VERB_LEAF_WITH_PRONOUNS_TREGEX +
                          // Name the matched verb tag as the target for insertion;
                          // require that it have siblings
                          "=target $ __) " +

                          // Locate the clause which contains it, and
                          // the child just below that clause
                          ">+(/^[^S]/) (/^(infinitiu|gerundi|grup\\.verb)$/ " +
                          "> /^(sentence|S|grup\\.verb|infinitiu|gerundi)$/=clause << =vb " +

                          // Make sure we're not up too far in the tree:
                          // there should be no infinitive / gerund /
                          // verb phrase between the located ancestor
                          // and the verb
                          "!<< (/^(infinitiu|gerundi|grup\\.verb)$/ << =vb))");

  /**
   * Matches verbs which really should be in a clause, but were
   * squeezed into an infinitive constituent (because the pronoun was
   * attached to the verb, we could just pretend it wasn't a clause..
   * not anymore!)
   */
  private static final TregexPattern clauselessVerbWithCliticPronouns =
    TregexPattern.compile(
      VERB_LEAF_WITH_PRONOUNS_TREGEX + ") > (/^vmn/ > (/^infinitiu$/=target > /^sp$/))"
    );

  private static final TsurgeonPattern clausifyVerbWithCliticPronouns =
    Tsurgeon.parseOperation("adjoinF (S foot@) target");

  private static final SpanishVerbStripper verbStripper = SpanishVerbStripper.getInstance();

  /**
   * Separate clitic pronouns into their own tokens in the given tree.
   * (The clitic pronouns are attached under new `grup.nom` constituents
   * which follow the verbs to which they were formerly attached.)
   */
  private static Tree expandCliticPronouns(Tree t) {
    // Perform some cleanup first -- we want to match as many
    // clitic-attached verbs as possible..
    t = Tsurgeon.processPattern(clauselessVerbWithCliticPronouns,
                                clausifyVerbWithCliticPronouns, t);

    // Run two separate stages: one for only-child VPs, then another
    // for VP children which have siblings
    t = expandCliticPronounsInner(t, verbWithCliticPronouns);
    t = expandCliticPronounsInner(t, verbWithCliticPronounsAndSiblings);

    return t;
  }

  /**
   * Expand clitic pronouns on verbs matching the given pattern.
   *
   * The pattern is expected to bind the nodes `vb` (verb leaf),
   * `clause` (context for disambiguation) and `target` (node after
   * which the pronoun phrases are inserted).
   */
  private static Tree expandCliticPronounsInner(Tree t, TregexPattern pattern) {
    TregexMatcher matcher = pattern.matcher(t);
    while (matcher.find()) {
      Tree verbNode = matcher.getNode("vb");
      String verb = verbNode.value();

      if (!SpanishVerbStripper.isStrippable(verb)) {
        continue;
      }

      SpanishVerbStripper.StrippedVerb split = verbStripper.separatePronouns(verb);
      if (split == null) {
        continue;
      }

      // Retrieve some context for the pronoun disambiguator: the
      // space-joined yield of the matched clause
      StringBuilder clauseYieldBuilder = new StringBuilder();
      for (Label label : matcher.getNode("clause").yield()) {
        clauseYieldBuilder.append(label.value()).append(" ");
      }
      String clauseYield = clauseYieldBuilder.toString();
      // Drop the trailing separator
      clauseYield = clauseYield.substring(0, clauseYield.length() - 1);

      // Insert clitic pronouns as leaves of pronominal phrases which are
      // siblings of `target`. Iterate in reverse order since pronouns are
      // attached to immediate right of `target`
      List<String> pronouns = split.getPronouns();
      for (int i = pronouns.size() - 1; i >= 0; i--) {
        String pronoun = pronouns.get(i);

        String newTreeStr = null;
        if (AnCoraPronounDisambiguator.isAmbiguous(pronoun)) {
          AnCoraPronounDisambiguator.PersonalPronounType type =
            AnCoraPronounDisambiguator.disambiguatePersonalPronoun(split, i, clauseYield);

          switch (type) {
          case OBJECT:
            newTreeStr = "(sn (grup.nom (pp000000 %s)))";
            break;
          case REFLEXIVE:
            newTreeStr = "(morfema.pronominal (pp000000 %s))";
            break;
          case UNKNOWN:
            // Mark for manual disambiguation
            newTreeStr = "(PRONOUN? (pp000000 %s))";
            break;
          }
        } else {
          // Unambiguous clitic pronouns are all indirect / direct
          // object pronouns.. convenient!
          newTreeStr = "(sn (grup.nom (pp000000 %s)))";
        }

        String patternString = "[insert " + String.format(newTreeStr, pronoun) + " $- target]";
        TsurgeonPattern insertPattern = Tsurgeon.parseOperation(patternString);
        t = insertPattern.matcher().evaluate(t, matcher);
      }

      // Finally, leave only the stem on the verb leaf itself
      TsurgeonPattern relabelOperation =
        Tsurgeon.parseOperation(String.format("[relabel vb /%s/]", split.getStem()));
      t = relabelOperation.matcher().evaluate(t, matcher);
    }

    return t;
  }

  /**
   * Compiled rules used by {@link #markSimpleNamedEntities}: every
   * template below instantiated once per named entity type.
   */
  private static final List<Pair<TregexPattern, TsurgeonPattern>> markSimpleNEs;

  // Generate some reusable patterns for four different NE groups
  static {
    @SuppressWarnings("unchecked")
    Pair<String, String>[] patternTemplates = new Pair[] {
      // NE as only child of a `grup.nom`
      new Pair("/^grup\\.nom$/=target <: (/np0000%c/ < __)",
               "[relabel target /grup.nom.%s/]"),

      // NE as child with a right sibling in a `grup.nom`
      new Pair("/^grup\\.nom$/ < ((/np0000%c/=target < __) $+ __)",
               "[adjoinF (grup.nom.%s foot@) target]"),

      // NE as child with a left sibling in a `grup.nom`
      new Pair("/^grup\\.nom$/ < ((/np0000%c/=target < __) $- __)",
               "[adjoinF (grup.nom.%s foot@) target]")
    };

    // Pairs tagset annotation codes with the annotations used in our
    // constituents
    @SuppressWarnings("unchecked")
    Pair<Character, String>[] namedEntityTypes = new Pair[] {
      new Pair('0', "otros"), // other
      new Pair('l', "lug"), // place
      new Pair('o', "org"), // organization
      new Pair('p', "pers"), // person
    };

    markSimpleNEs =
      new ArrayList<>(patternTemplates.length * namedEntityTypes.length);
    for (Pair<String, String> template : patternTemplates) {
      for (Pair<Character, String> namedEntityType : namedEntityTypes) {
        String tregex = String.format(template.first(), namedEntityType.first());
        String tsurgeon = String.format(template.second(), namedEntityType.second());

        markSimpleNEs.add(new Pair<>(TregexPattern.compile(tregex),
                                     Tsurgeon.parseOperation(tsurgeon)));
      }
    }
  }

  /**
   * Find all named entities which are not multi-word tokens and nest
   * them in named entity NP groups (`grup.nom.{lug,org,pers,otros}`).
   *
   * Do this only for "simple" NEs: the multi-word NEs have to be done
   * at a later step in `MultiWordPreprocessor`.
   */
  void markSimpleNamedEntities(Tree t) {
    Tsurgeon.processPatternsOnTree(markSimpleNEs, t);
  }

  /**
   * Determine whether the given tree node is a multi-word token
   * expansion candidate. (True if the node has at least one grandchild
   * which is a leaf node.)
   */
  boolean isMultiWordCandidate(Tree t) {
    for (Tree child : t.children()) {
      for (Tree grandchild : child.children()) {
        if (grandchild.isLeaf()) {
          return true;
        }
      }
    }

    return false;
  }

  /**
   * Normalize a pre-pre-terminal tree node by accounting for multi-word
   * tokens.
   *
   * Detects multi-word tokens in leaves below this pre-pre-terminal and
   * expands their constituent words into separate leaves.
   *
   * Leaves which split into one word, or into no words at all (a token
   * made up solely of dropped separators), are left untouched.
   */
  void normalizeForMultiWord(Tree t, TreeFactory tf) {
    Tree[] preterminals = t.children();

    for (int i = 0; i < preterminals.length; i++) {
      // This particular child is not actually a preterminal --- skip
      if (!preterminals[i].isPreTerminal()) {
        continue;
      }

      Tree leaf = preterminals[i].firstChild();
      String leafValue = ((CoreLabel) leaf.label()).value();

      String[] words = getMultiWords(leafValue);
      // Nothing to expand. The empty case must be skipped as well:
      // otherwise the preterminal would be replaced by a phrase node
      // with no children.
      if (words.length <= 1) {
        continue;
      }

      // Leaf is a multi-word token; build new nodes for each of its
      // constituent words
      List<Tree> newNodes = new ArrayList<>(words.length);
      for (String word1 : words) {
        String word = normalizeTerminal(word1);

        Tree newLeaf = tf.newLeaf(word);
        if (newLeaf.label() instanceof HasWord) {
          ((HasWord) newLeaf.label()).setWord(word);
        }

        Tree newNode = tf.newTreeNode(MW_TAG, Arrays.asList(newLeaf));
        if (newNode.label() instanceof HasTag) {
          ((HasTag) newNode.label()).setTag(MW_TAG);
        }

        newNodes.add(newNode);
      }

      // Value of the phrase which should head these preterminals. Mark
      // that this was created from a multiword token, and also retain
      // the original parts of speech.
      String phraseValue = MW_PHRASE_TAG + "_"
        + simplifyPOSTag(preterminals[i].value());

      // Should we insert these new nodes as children of the parent `t`
      // (i.e., "merge" the multi-word token phrase into its parent), or
      // head them with a new node and set that as a child of the
      // parent?
      boolean shouldMerge = preterminals.length == 1
        && mergeWithConstituentWhenPossible.contains(t.value());

      if (shouldMerge) {
        t.setChildren(newNodes);
        t.setValue(phraseValue);
      } else {
        Tree newHead = tf.newTreeNode(phraseValue, newNodes);
        t.setChild(i, newHead);
      }
    }
  }

  /**
   * A token which is entirely wrapped in double quotes.
   */
  private static final Pattern pQuoted = Pattern.compile("\"(.+)\"");

  /**
   * Strings of punctuation which should remain a single token.
   */
  private static final Pattern pPunct = Pattern.compile("[.,!?:/'=()-]+");

  /**
   * Characters which may separate words in a single token.
   */
  private static final String WORD_SEPARATORS = ",-_¡!¿?()/%";

  /**
   * Word separators which should not be treated as separate "words" and
   * dropped from a multi-word token.
   */
  private static final String WORD_SEPARATORS_DROP = "_";

  /**
   * These bound morphemes should not be separated from the words with
   * which they are joined by hyphen.
   */
  // TODO how to handle clitics? chino-japonés
  private static final Set<String> hyphenBoundMorphemes = new HashSet<>(Arrays.asList(
    "anti", // anti-Gil
    "co", // co-promotora
    "ex", // ex-diputado
    "meso", // meso-americano
    "neo", // neo-proteccionismo
    "pre", // pre-presidencia
    "pro", // pro-indonesias
    "quasi", // quasi-unidimensional
    "re", // re-flotamiento
    "semi", // semi-negro
    "sub" // sub-18
  ));

  /**
   * Prepare the given token for multi-word detection / extraction.
   *
   * This method makes up for some various oddities in corpus annotations:
   * the literal markers "-fpa-" and "-fpt-" are turned into "(" and ")".
   */
  private String prepareForMultiWordExtraction(String token) {
    // Plain literal replacement; no regular expression is needed here
    return token.replace("-fpa-", "(").replace("-fpt-", ")");
  }

  /**
   * Return the (single or multiple) words which make up the given
   * token.
   *
   * The result may be empty when the token contains nothing but
   * separators which are dropped (see {@link #shouldDropWord}).
   *
   * TODO can't SpanishTokenizer handle most of this?
   */
  private String[] getMultiWords(String token) {
    token = prepareForMultiWordExtraction(token);

    // Pure punctuation stays together
    Matcher punctMatcher = pPunct.matcher(token);
    if (punctMatcher.matches()) {
      return new String[] {token};
    }

    // A fully quoted token becomes: quote, contents, quote
    Matcher quoteMatcher = pQuoted.matcher(token);
    if (quoteMatcher.matches()) {
      String[] ret = new String[3];
      ret[0] = "\"";
      ret[1] = quoteMatcher.group(1);
      ret[2] = "\"";

      return ret;
    }

    // Confusing: we are using a tokenizer to split a token into its
    // constituent words
    StringTokenizer splitter = new StringTokenizer(token, WORD_SEPARATORS,
                                                   true);
    int remainingTokens = splitter.countTokens();

    List<String> words = new ArrayList<>();

    while (splitter.hasMoreTokens()) {
      String word = splitter.nextToken();
      remainingTokens--;

      if (shouldDropWord(word)) {
        // This is a delimiter that we should drop
        continue;
      }

      if (remainingTokens >= 2 && hyphenBoundMorphemes.contains(word)) {
        String hyphen = splitter.nextToken();
        remainingTokens--;

        if (!hyphen.equals("-")) {
          // Ouch. We expected a hyphen here. Clean things up and keep
          // moving.
          words.add(word);

          if (!shouldDropWord(hyphen)) {
            words.add(hyphen);
          }

          continue;
        }

        String freeMorpheme = splitter.nextToken();
        remainingTokens--;

        // Keep bound morpheme, hyphen and following word as one word
        words.add(word + hyphen + freeMorpheme);

        continue;
      } else if (word.equals(",") && remainingTokens >= 1 && words.size() > 0) {
        int prevIndex = words.size() - 1;
        String prevWord = words.get(prevIndex);

        if (StringUtils.isNumeric(prevWord)) {
          String nextWord = splitter.nextToken();
          remainingTokens--;

          if (StringUtils.isNumeric(nextWord)) {
            // Number, comma, number: rejoin into a single word
            words.set(prevIndex, prevWord + ',' + nextWord);
          } else {
            // Expected a number here.. clean up and move on
            words.add(word);
            words.add(nextWord);
          }

          continue;
        }
      }

      // Otherwise..
      words.add(word);
    }

    return words.toArray(new String[words.size()]);
  }

  /**
   * Determine if the given "word" which is part of a multiword token
   * should be dropped.
   */
  private boolean shouldDropWord(String word) {
    return word.length() == 1
      && WORD_SEPARATORS_DROP.indexOf(word.charAt(0)) != -1;
  }

  /**
   * Expand grandchild tokens which are elided forms of multi-word
   * expressions ('al,' 'del').
   *
   * We perform this expansion separately from multi-word expansion
   * because we follow special rules about where the expanded tokens
   * should be placed in the case of elision.
   *
   * @param t Tree representing an entire sentence
   */
  private Tree expandElisions(Tree t) {
    return Tsurgeon.processPatternsOnTree(elisionExpansions, t);
  }

  /**
   * Tregex pattern / Tsurgeon operation pairs (as source strings) which
   * are compiled into {@link #elisionExpansions}.
   */
  @SuppressWarnings("unchecked")
  private static final Pair<String, String>[] elisionExpansionStrs = new Pair[] {
    // Elided forms with a `prep` ancestor which has an `sn` phrase as a
    // right sibling
    new Pair(// Search for `sn` which is right sibling of closest `prep`
             // ancestor to the elided node; cascade down tree to lowest `sn`
             "/^(prep|sadv|conj)$/ <+(/^(prep|grup\\.(adv|cc|prep))$/) (sp000=sp < /(?i)^(del|al)$/=elided) <<` =sp " +
               "$+ (sn > (__ <+(sn) (sn=sn !< sn) << =sn) !$- sn)",

             // Insert the 'el' specifier as a constituent in adjacent
             // noun phrase
             "[relabel elided /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),

    // Prepositional forms with a `prep` grandparent which has a
    // `grup.nom` phrase as a right sibling
    new Pair("prep < (sp000 < /(?i)^(del|al)$/=elided) $+ /grup\\.nom/=target",
             "[relabel elided /(?i)l//] " +
               "[adjoinF (sn (spec (da0000 el)) foot@) target]"),

    // Elided forms with a `prep` ancestor which has an adjectival
    // phrase as a right sibling ('al segundo', etc.)
    new Pair("prep < (sp000 < /(?i)^(del|al)$/=elided) $+ /s\\.a/=target",
             "[relabel elided /(?i)l//] " +
               // Turn neighboring adjectival phrase into a noun phrase,
               // adjoining original adj phrase beneath a `grup.nom`
               "[adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) target]"),

    // "del que golpea:" insert 'el' as specifier into adjacent relative
    // phrase
    new Pair("sp < (prep=prep < (sp000 < /(?i)^(a|de)l$/=elided) $+ " +
               "(S=S <<, relatiu))",

             // Build a noun phrase in the neighboring relative clause
             // containing the 'el' specifier
             "[relabel elided /(?i)l//] " +
               "[adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) S]"),

    // "al" + infinitive phrase
    new Pair("prep < (sp000 < /(?i)^(al|del)$/=elided) $+ " +
               // Looking for an infinitive directly to the right of the
               // "al" token, nested within one or more clause
               // constituents
               "(S=target <+(S) infinitiu=inf <<, =inf)",

             "[relabel elided /(?i)l//] " +
               "[adjoinF (sn (spec (da0000 el)) foot@) target]"),

    // "al no" + infinitive phrase
    new Pair("prep < (sp000 < /(?i)^al$/=elided) $+ (S=target <, neg <2 infinitiu)",
             "[relabel elided a] " +
               "[adjoinF (sn (spec (da0000 el)) foot@) target]"),

    // "al que quisimos tanto"
    new Pair("prep < (sp000 < /(?i)^al$/=elided) $+ relatiu=target",
             "[relabel elided a] " +
               "[adjoinF (sn (spec (da0000 el)) foot@) target]"),

    // "al de" etc.
    new Pair("prep < (sp000 < /(?i)^al$/=elided) $+ (sp=target <, prep)",
             "[relabel elided a] " +
               "[adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) target]"),

    // leading adjective in sibling: "al chileno Fernando"
    new Pair("prep < (sp000 < /(?i)^(del|al)$/=elided) $+ " +
               "(/grup\\.nom/=target <, /s\\.a/ <2 /sn|nc0[sp]000/)",
             "[relabel elided /(?i)l//] " +
               "[adjoinF (sn (spec (da0000 el)) foot@) target]"),

    // "al" + phrase begun by participle -> "a lo <participle>"
    // e.g. "al conseguido" -> "a lo conseguido"
    new Pair("prep < (sp000 < /(?i)^(al|del)$/=elided) $+ (S=target < participi)",
             "[relabel elided /(?i)l//] " +
               "[adjoinF (sn (spec (da0000 lo)) foot@) target]"),

    // "del" used within specifier; e.g. "más del 30 por ciento"
    new Pair("spec < (sp000=target < /(?i)^del$/=elided) > sn $+ /grup\\.nom/",
             "[relabel elided /(?i)l//] " +
               "[insert (da0000 el) $- target]"),

    // "del," "al" in date phrases: "1 de enero del 2001"
    new Pair("sp000=kill < /(?i)^(del|al)$/ $+ w=target",
             "[delete kill] " +
               "[adjoinF (sp (prep (sp000 de)) (sn (spec (da0000 el)) foot@)) target]"),

    // "a favor del X," "en torno al Y": very common (and somewhat
    // complex) phrase structure that we can match
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
               ">` (prep=prep > sp $+ (sn=sn <, /^grup\\.(nom|[wz])/))))",
             "[relabel contraction /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),

    // "en vez del X": same as above, except prepositional phrase
    // functions as conjunction (and is labeled as such)
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (sp >: (conj $+ (sn=sn <, /^grup\\.(nom|[wz])/))))",
             "[relabel contraction /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),

    // "a favor del X," "en torno al Y" where X, Y are doubly nested
    // substantives
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
               ">` (prep=prep > sp $+ (sn <, (sn=sn <, /^grup\\.(nom|[wz])/)))))",
             "[relabel contraction /(?i)l//] [insert (spec (da0000 el)) >0 sn]"),

    // "a favor del X," "en torno al Y" where X, Y already have
    // leading specifiers
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
               ">` (prep > sp $+ (sn=sn <, spec=spec))))",
             "[relabel contraction /(?i)l//] [insert (da0000 el) >0 spec]"),

    // "a favor del X," "en torno al Y" where X, Y are nominal
    // groups (not substantives)
    new Pair("sp000 < /(?i)^(a|de)l$/=contraction >: (prep >` (/^grup\\.prep$/ " +
               ">` (prep > sp $+ /^grup\\.(nom|[wz])$/=ng)))",
             "[adjoinF (sn (spec (da0000 el)) foot@) ng] [relabel contraction /(?i)l//]"),

    // "al," "del" as part of coordinating conjunction: "frente al,"
    // "además del"
    //
    // (nearby noun phrase labeled as nominal group)
    new Pair("sp000 < /(?i)^(de|a)l$/=elided >` (/^grup\\.cc$/ >: (conj $+ /^grup\\.nom/=gn))",
             "[relabel elided /(?i)l//] [adjoinF (sn (spec (da0000 el)) foot@) gn]"),

    // "al" + participle in adverbial phrase: "al contado," "al descubierto"
    new Pair("sp000=sp < /(?i)^al$/=elided $+ /^vmp/",
             "[relabel elided /(?i)l//] [insert (da0000 el) $- sp]"),

    // über-special case: 15021_20000218.tbf-5
    //
    // intentional: article should bind all of quoted phrase, even
    // though there are multiple clauses (kind of a crazy sentence)
    new Pair("prep < (sp000 < /(?i)^(al|del)$/=elided) $+ (S=S <+(S) (/^f/=punct $+ (S <+(S) (S <, infinitiu))))",
             "[relabel elided /(?i)l//] [adjoinF (sn (spec (da0000 el)) (grup.nom foot@)) S]"),

    // special case: "del todo" -> "de el todo" (flat)
    new Pair("__=sp < del=contraction >, __=parent $+ (__ < todo >` =parent)",
             "[relabel contraction de] [insert (da0000 el) $- sp]"),
  };

  private static final List<Pair<TregexPattern, TsurgeonPattern>> elisionExpansions =
    compilePatterns(elisionExpansionStrs);

  /**
   * Matches "conmigo" / "contigo" / "consigo" leaves (any letter case)
   * under a `pp*` tag within a `grup.nom` whose parent is an `sn`.
   */
  private static final TregexPattern conmigoPattern =
    TregexPattern.compile("/(?i)^con[mst]igo$/=conmigo > (/^pp/ > (/^grup\\.nom$/ > sn=sn))");

  /**
   * ¡Venga, expand conmigo!
   *
   * Relabels each matched leaf with the bare pronoun ("mí", "ti" or
   * "sí") and adjoins an `sp` phrase holding the preposition "con"
   * above the containing `sn`.
   */
  private static Tree expandConmigo(Tree t) {
    TregexMatcher matcher = conmigoPattern.matcher(t);

    while (matcher.find()) {
      Tree conmigoNode = matcher.getNode("conmigo");
      String word = conmigoNode.value();

      String newPronoun = null;
      if (word.equalsIgnoreCase("conmigo")) {
        newPronoun = "mí";
      } else if (word.equalsIgnoreCase("contigo")) {
        newPronoun = "ti";
      } else if (word.equalsIgnoreCase("consigo")) {
        newPronoun = "sí";
      }

      // Use a fixed locale so that the result does not depend on the
      // JVM's default locale (e.g. "ti" -> "Tİ" under a Turkish locale).
      // NOTE(review): this upper-cases the whole pronoun, not just its
      // first letter -- confirm that this is the intended output.
      if (word.charAt(0) == 'C') {
        newPronoun = newPronoun.toUpperCase(Locale.ROOT);
      }

      String tsurgeon = String.format(
        "[relabel conmigo /%s/]" +
          "[adjoinF (sp (prep (sp000 con)) foot@) sn]",
        newPronoun);

      TsurgeonPattern pattern = Tsurgeon.parseOperation(tsurgeon);
      t = pattern.matcher().evaluate(t, matcher);
    }

    return t;
  }

  /**
   * Compile each (Tregex source, Tsurgeon source) pair into its
   * (pattern, operation) form, preserving order.
   */
  private static List<Pair<TregexPattern, TsurgeonPattern>> compilePatterns(Pair<String, String>[] patterns) {
    List<Pair<TregexPattern, TsurgeonPattern>> ret = new ArrayList<>(patterns.length);
    for (Pair<String, String> pattern : patterns) {
      ret.add(new Pair<>(TregexPattern.compile(pattern.first()),
                         Tsurgeon.parseOperation(pattern.second())));
    }

    return ret;
  }

}